//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64ExpandImm.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
static cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

static cl::opt<bool>
EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                         cl::desc("Enable AArch64 logical imm instruction "
                                  "optimization"),
                         cl::init(true));

/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;

/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
///
/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
/// same register class, but only nxv8f16 can be treated as a packed vector.
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal vector type!");
  return VT.isFixedLengthVector() ||
         VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
}

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    // Someone set us up the NEON.
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);
    addDRTypeForNEON(MVT::v4bf16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
    addQRTypeForNEON(MVT::v8bf16);
  }

  if (Subtarget->hasSVE()) {
    // Add legal sve predicate types
    addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);

    // Add legal sve data types
    addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);

    if (Subtarget->hasBF16()) {
      addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
    }

    if (useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);
      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);
    }

    for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
      // ... (operation actions elided)
    }

    for (auto VT :
         { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
           MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);

    for (auto VT :
         { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32,
           MVT::nxv2f64 }) {
      // ... (operation actions elided)
    }
  }

  // Compute derived properties from the register classes
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions
  // ... (basic operation actions elided)

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most cases.
  // ... (f128 operation actions elided)

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  // ... (conversion operation actions elided)

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
  // ... (ADDC/ADDE/SUBC/SUBE actions elided)

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have i32 MULH{S|U}.
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  // ... (division, remainder and related actions elided)

  // Custom lower Add/Sub/Mul with overflow.
  // ... ([SU]ADDO, [SU]SUBO and [SU]MULO actions elided)

  // ...

  if (Subtarget->hasFullFP16())
    /* ... */;
  else
    /* ... */;

  // ... (f16 operation actions elided)

  if (!Subtarget->hasFullFP16()) {
    // ... (promote f16 scalar operations to f32)

    // promote v4f16 to v4f32 when that is known to be safe.
    // ... (promoted and expanded f16 vector operations elided)
  }

  // AArch64 has implementations of a lot of rounding-like FP operations.
  for (MVT Ty : {MVT::f32, MVT::f64}) {
    // ... (FFLOOR/FNEARBYINT/FCEIL/FRINT/FROUND/FTRUNC etc. set to Legal)
  }

  if (Subtarget->hasFullFP16()) {
    // ... (the same rounding actions for MVT::f16)
  }

  // ...

  // 128-bit loads and stores can be done without expanding
  setOperationAction(ISD::LOAD, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);

  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no un-paired non-temporal stores and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
  setOperationAction(ISD::STORE, MVT::v4i64, Custom);

  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
  // This requires the Performance Monitors extension.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    // Issue __sincos_stret if available.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  // Make floating-point constants legal for the large code model, so they don't
  // become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  // ... (truncating store actions elided)

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    // ... (setIndexedLoadAction/setIndexedStoreAction calls elided)
  }

  // Trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);
  // Try to create BICs for vector ANDs.
  setTargetDAGCombine(ISD::AND);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV..
  // ... (further setTargetDAGCombine calls elided)

  if (Subtarget->supportsAddressTopByteIgnored())
    setTargetDAGCombine(ISD::LOAD);

  // ... (further setTargetDAGCombine calls elided)

  // In case of strict alignment, avoid an excessive number of byte wide stores.
  // ... (MaxStoresPerMemset/Memcpy/Memmove limits elided)

  // Set required alignment.
  setMinFunctionAlignment(Align(4));
  // Set preferred alignments.
  setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));
  setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));

  // Only change the limit for entries in a jump table if specified by
  // the sub target, but not at the command line.
  unsigned MaxJT = STI.getMaximumJumpTableSize();
  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
    setMaximumJumpTableSize(MaxJT);

  setHasExtractBitsInsn(true);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    // ... (v1f64 operation actions elided)

    // ...

    // AArch64 doesn't have a direct vector ->f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
    // ...
    // i8 vector elements also need promotion to i32 for v8i8
    // ...
    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
    // ...
    // Or, direct i32 -> f16 vector conversion. Set it to custom, so the
    // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
    // ...

    if (Subtarget->hasFullFP16()) {
      // ...
    } else {
      // when AArch64 doesn't have fullfp16 support, promote the input
      // to i32 first.
      // ...
    }

    // ...

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      // Vector reductions
      // ... (VECREDUCE actions elided)

      // Saturates
      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
    }
    for (MVT VT : { MVT::v4f16, MVT::v2f32,
                    MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
    }

    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      // ...
      if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
        setOperationAction(ISD::MULHS, VT, Custom);
        setOperationAction(ISD::MULHU, VT, Custom);
      } else {
        setOperationAction(ISD::MULHS, VT, Expand);
        setOperationAction(ISD::MULHU, VT, Expand);
      }
      // ...
      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
        // ... (truncating-store and extending-load actions elided)
      }
    }

    // AArch64 has implementations of a lot of rounding-like FP operations.
    for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
      // ...
    }

    if (Subtarget->hasFullFP16()) {
      for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
        // ...
      }
    }

    if (Subtarget->hasSVE())
      setOperationAction(ISD::VSCALE, MVT::i32, Custom);

    // ...
  }

  if (Subtarget->hasSVE()) {
    // FIXME: Add custom lowering of MLOAD to handle different passthrus (not a
    // splat of 0 or undef) once vector selects supported in SVE codegen. See
    // D68877 for more details.
    for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
      if (isTypeLegal(VT)) {
        // ...
        if (VT.getScalarType() == MVT::i1) {
          // ...
        }
      }
    }

    for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32})
      /* ... */;

    // ...

    for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
      if (isTypeLegal(VT)) {
        // ...
      }
    }

    // NOTE: Currently this has to happen after computeRegisterProperties rather
    // than the preferred option of combining it with the addRegisterClass call.
    if (useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addTypeForFixedLengthSVE(VT);
      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addTypeForFixedLengthSVE(VT);

      // 64bit results can mean a bigger than NEON input.
      for (auto VT : {MVT::v8i8, MVT::v4i16})
        setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);

      // 128bit results imply a bigger than NEON input.
      for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
        setOperationAction(ISD::TRUNCATE, VT, Custom);
      for (auto VT : {MVT::v8f16, MVT::v4f32})
        setOperationAction(ISD::FP_ROUND, VT, Custom);
    }
  }

  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}

void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
  assert(VT.isVector() && "VT should be a vector type");

  if (VT.isFloatingPoint()) {
    // ... (promote FP loads/stores to the equivalent integer type)
  }

  // Mark vector float intrinsics as expand.
  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
    // ... (FSIN, FCOS, FPOW and friends set to Expand)

    // But we do support custom-lowering for FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, VT, Custom);
  }

  // ... (common vector operation actions elided)

  for (MVT InnerVT : MVT::all_valuetypes())
    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

  // CNT supports only B element sizes, then use UADDLP to widen.
  if (VT != MVT::v8i8 && VT != MVT::v16i8)
    setOperationAction(ISD::CTPOP, VT, Custom);

  // ...

  if (!VT.isFloatingPoint())
    setOperationAction(ISD::ABS, VT, Legal);

  // [SU][MIN|MAX] are available for all NEON types apart from i64.
  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);

  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
  if (VT.isFloatingPoint() &&
      (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
    for (unsigned Opcode :
         {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
      setOperationAction(Opcode, VT, Legal);

  if (Subtarget->isLittleEndian()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
    }
  }
}

void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  // By default everything must be expanded.
  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
    setOperationAction(Op, VT, Expand);

  // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

  // Lower fixed length vector operations to scalable equivalents.
  // ... (per-operation Custom actions elided)
}

void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR64RegClass);
  addTypeForNEON(VT, MVT::v2i32);
}

void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR128RegClass);
  addTypeForNEON(VT, MVT::v4i32);
}

EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
                                              LLVMContext &C, EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  if (VT.isScalableVector())
    return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
  return VT.changeVectorElementTypeToInteger();
}

static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
                               const APInt &Demanded,
                               TargetLowering::TargetLoweringOpt &TLO,
                               unsigned NewOpc) {
  uint64_t OldImm = Imm, NewImm, Enc;
  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;

  // Return if the immediate is already all zeros, all ones, a bimm32 or a
  // bimm64.
  if (Imm == 0 || Imm == Mask ||
      AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
    return false;

  unsigned EltSize = Size;
  uint64_t DemandedBits = Demanded.getZExtValue();

  // Clear bits that are not demanded.
  Imm &= DemandedBits;

  while (true) {
    // The goal here is to set the non-demanded bits in a way that minimizes
    // the number of transitions between 0 and 1. In order to achieve this goal,
    // we set the non-demanded bits to the value of the preceding demanded bits.
    // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
    // non-demanded bit), we copy bit0 (1) to the least significant 'x',
    // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
    // The final result is 0b11000011.
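    //
    // A worked trace of the steps below (illustrative only, not extra code):
    // with EltSize == 8, Imm == 0b01000001 and DemandedBits == 0b01100101
    // (the 0bx10xx0x1 case above):
    //   NonDemandedBits = 0b10011010
    //   InvertedImm     = ~Imm & DemandedBits              = 0b00100100
    //   RotatedImm      = (InvertedImm rotl 1) & NonDemandedBits = 0b00001000
    //   Sum             = RotatedImm + NonDemandedBits     = 0b10100010
    //   Carry           = 0
    //   Ones            = (Sum + Carry) & NonDemandedBits  = 0b10000010
    //   NewImm          = (Imm | Ones) & Mask              = 0b11000011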
    uint64_t NonDemandedBits = ~DemandedBits;
    uint64_t InvertedImm = ~Imm & DemandedBits;
    uint64_t RotatedImm =
        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
        NonDemandedBits;
    uint64_t Sum = RotatedImm + NonDemandedBits;
    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
    NewImm = (Imm | Ones) & Mask;

    // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
    // we halve the element size and continue the search.
    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
      break;

    // We cannot shrink the element size any further if it is 2-bits.
    if (EltSize == 2)
      return false;

    EltSize /= 2;
    Mask >>= EltSize;
    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;

    // Return if there is mismatch in any of the demanded bits of Imm and Hi.
    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
      return false;

    // Merge the upper and lower halves of Imm and DemandedBits.
    Imm |= Hi;
    DemandedBits |= DemandedBitsHi;
  }

  ++NumOptimizedImms;

  // Replicate the element across the register width.
  while (EltSize < Size) {
    NewImm |= NewImm << EltSize;
    EltSize *= 2;
  }

  (void)OldImm;
  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
         "demanded bits should never be altered");
  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");

  // Create the new constant immediate node.
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue New;

  // If the new constant immediate is all-zeros or all-ones, let the target
  // independent DAG combine optimize this node.
  if (NewImm == 0 || NewImm == OrigMask) {
    New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
                          TLO.DAG.getConstant(NewImm, DL, VT));
  // Otherwise, create a machine node so that target independent DAG combine
  // doesn't undo this optimization.
  } else {
    Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
    SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
    New = SDValue(
        TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
  }

  return TLO.CombineTo(Op, New);
}

bool AArch64TargetLowering::targetShrinkDemandedConstant(
    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
    TargetLoweringOpt &TLO) const {
  // Delay this optimization to as late as possible.
  if (!TLO.LegalOps)
    return false;

  if (!EnableOptimizeLogicalImm)
    return false;

  EVT VT = Op.getValueType();
  if (VT.isVector())
    return false;

  unsigned Size = VT.getSizeInBits();
  assert((Size == 32 || Size == 64) &&
         "i32 or i64 is expected after legalization.");

  // Exit early if we demand all bits.
  if (DemandedBits.countPopulation() == Size)
    return false;

  unsigned NewOpc;
  switch (Op.getOpcode()) {
  default:
    return false;
  case ISD::AND:
    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
    break;
  case ISD::OR:
    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
    break;
  case ISD::XOR:
    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
    break;
  }
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;
  uint64_t Imm = C->getZExtValue();
  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
}

/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them Known.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known,
    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
  switch (Op.getOpcode()) {
  default:
    break;
  case AArch64ISD::CSEL: {
    KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known.Zero &= Known2.Zero;
    Known.One &= Known2.One;
    break;
  }
  case AArch64ISD::LOADgot:
  case AArch64ISD::ADDlow: {
    if (!Subtarget->isTargetILP32())
      break;
    // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
    Known.Zero = APInt::getHighBitsSet(64, 32);
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::aarch64_ldaxr:
    case Intrinsic::aarch64_ldxr: {
      unsigned BitWidth = Known.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN:
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IntNo) {
    default:
      break;
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
      // this as those are legal types and will be handled by isel directly.
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
      unsigned BitWidth = Known.getBitWidth();
      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
        assert(BitWidth >= 8 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
        Known.Zero |= Mask;
      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
        assert(BitWidth >= 16 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
        Known.Zero |= Mask;
      }
      break;
    } break;
    }
  }
  }
}

MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
                                                  EVT) const {
  return MVT::i64;
}

bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
    bool *Fast) const {
  if (Subtarget->requiresStrictAlign())
    return false;

  if (Fast) {
    // Some CPUs are fine with unaligned stores except for 128-bit ones.
    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
            // See comments in performSTORECombine() for more details about
            // these conditions.

            // Code that uses clang vector extensions can mark that it
            // wants unaligned accesses to be treated as fast by
            // underspecifying alignment to be 1 or 2.
            Align <= 2 ||

            // Disregard v2i64. Memcpy lowering produces those and splitting
            // them regresses performance on micro-benchmarks and olden/bh.
            VT == MVT::v2i64;
  }
  return true;
}

// Same as above but handling LLTs instead.
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
    LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
    bool *Fast) const {
  if (Subtarget->requiresStrictAlign())
    return false;

  if (Fast) {
    // Some CPUs are fine with unaligned stores except for 128-bit ones.
    *Fast = !Subtarget->isMisaligned128StoreSlow() ||
            Ty.getSizeInBytes() != 16 ||
            // See comments in performSTORECombine() for more details about
            // these conditions.

            // Code that uses clang vector extensions can mark that it
            // wants unaligned accesses to be treated as fast by
            // underspecifying alignment to be 1 or 2.
            Alignment <= 2 ||

            // Disregard v2i64. Memcpy lowering produces those and splitting
            // them regresses performance on micro-benchmarks and olden/bh.
            Ty == LLT::vector(2, 64);
  }
  return true;
}

FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                      const TargetLibraryInfo *libInfo) const {
  return AArch64::createFastISel(funcInfo, libInfo);
}

const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
#define MAKE_CASE(V)                                                           \
  case V:                                                                      \
    return #V;
  switch ((AArch64ISD::NodeType)Opcode) {
  case AArch64ISD::FIRST_NUMBER:
    break;
    // ... (MAKE_CASE entries for every AArch64ISD opcode elided)
  }
#undef MAKE_CASE
  return nullptr;
}

MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  // We materialise the F128CSEL pseudo-instruction as some control flow and a
  // phi node:
  //
  // OrigBB:
  //     [... previous instrs leading to comparison ...]
  //     b.ne TrueBB
  //     b EndBB
  // TrueBB:
  //     ; Fallthrough
  // EndBB:
  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]

  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction::iterator It = ++MBB->getIterator();

  Register DestReg = MI.getOperand(0).getReg();
  Register IfTrueReg = MI.getOperand(1).getReg();
  Register IfFalseReg = MI.getOperand(2).getReg();
  unsigned CondCode = MI.getOperand(3).getImm();
  bool NZCVKilled = MI.getOperand(4).isKill();

  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, TrueBB);
  MF->insert(It, EndBB);

  // Transfer rest of current basic-block to EndBB
  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
                MBB->end());
  EndBB->transferSuccessorsAndUpdatePHIs(MBB);

  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
  MBB->addSuccessor(TrueBB);
  MBB->addSuccessor(EndBB);

  // TrueBB falls through to the end.
  TrueBB->addSuccessor(EndBB);

  if (!NZCVKilled) {
    TrueBB->addLiveIn(AArch64::NZCV);
    EndBB->addLiveIn(AArch64::NZCV);
  }

  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
      .addReg(IfTrueReg)
      .addMBB(TrueBB)
      .addReg(IfFalseReg)
      .addMBB(MBB);

  MI.eraseFromParent();
  return EndBB;
}

MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
       MachineInstr &MI, MachineBasicBlock *BB) const {
  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
             BB->getParent()->getFunction().getPersonalityFn())) &&
         "SEH does not use catchret!");
  return BB;
}

MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *BB) const {
  switch (MI.getOpcode()) {
  default:
#ifndef NDEBUG
    MI.dump();
#endif
    llvm_unreachable("Unexpected instruction for custom inserter!");

  case AArch64::F128CSEL:
    return EmitF128CSEL(MI, BB);

  case TargetOpcode::STACKMAP:
  case TargetOpcode::PATCHPOINT:
    return emitPatchPoint(MI, BB);

  case AArch64::CATCHRET:
    return EmitLoweredCatchRet(MI, BB);
  }
}

//===----------------------------------------------------------------------===//
// AArch64 Lowering private implementation.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//

/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
  switch (CC) {
  default:
    llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:
    return AArch64CC::NE;
  case ISD::SETEQ:
    return AArch64CC::EQ;
  case ISD::SETGT:
    return AArch64CC::GT;
  case ISD::SETGE:
    return AArch64CC::GE;
  case ISD::SETLT:
    return AArch64CC::LT;
  case ISD::SETLE:
    return AArch64CC::LE;
  case ISD::SETUGT:
    return AArch64CC::HI;
  case ISD::SETUGE:
    return AArch64CC::HS;
  case ISD::SETULT:
    return AArch64CC::LO;
  case ISD::SETULE:
    return AArch64CC::LS;
  }
}

/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static void changeFPCCToAArch64CC(ISD::CondCode CC,
                                  AArch64CC::CondCode &CondCode,
                                  AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ:
    CondCode = AArch64CC::EQ;
    break;
  case ISD::SETGT:
  case ISD::SETOGT:
    CondCode = AArch64CC::GT;
    break;
  case ISD::SETGE:
  case ISD::SETOGE:
    CondCode = AArch64CC::GE;
    break;
  case ISD::SETOLT:
    CondCode = AArch64CC::MI;
    break;
  case ISD::SETOLE:
    CondCode = AArch64CC::LS;
    break;
  case ISD::SETONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case ISD::SETO:
    CondCode = AArch64CC::VC;
    break;
  case ISD::SETUO:
    CondCode = AArch64CC::VS;
    break;
  case ISD::SETUEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case ISD::SETUGT:
    CondCode = AArch64CC::HI;
    break;
  case ISD::SETUGE:
    CondCode = AArch64CC::PL;
    break;
  case ISD::SETLT:
  case ISD::SETULT:
    CondCode = AArch64CC::LT;
    break;
  case ISD::SETLE:
  case ISD::SETULE:
    CondCode = AArch64CC::LE;
    break;
  case ISD::SETNE:
  case ISD::SETUNE:
    CondCode = AArch64CC::NE;
    break;
  }
}

/// Convert a DAG fp condition code to an AArch64 CC.
/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
/// should be AND'ed instead of OR'ed.
static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
                                     AArch64CC::CondCode &CondCode,
                                     AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    assert(CondCode2 == AArch64CC::AL);
    break;
  case ISD::SETONE:
    // (a one b)
    // == ((a olt b) || (a ogt b))
    // == ((a ord b) && (a une b))
    CondCode = AArch64CC::VC;
    CondCode2 = AArch64CC::NE;
    break;
  case ISD::SETUEQ:
    // (a ueq b)
    // == ((a uno b) || (a oeq b))
    // == ((a ule b) && (a uge b))
    CondCode = AArch64CC::PL;
    CondCode2 = AArch64CC::LE;
    break;
  }
}

/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
/// to get the same effect.
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
                                        AArch64CC::CondCode &CondCode,
                                        AArch64CC::CondCode &CondCode2,
                                        bool &Invert) {
  Invert = false;
  switch (CC) {
  default:
    // Mostly the scalar mappings work fine.
    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    break;
  case ISD::SETUO:
    Invert = true;
    LLVM_FALLTHROUGH;
  case ISD::SETO:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GE;
    break;
  case ISD::SETUEQ:
  case ISD::SETULT:
  case ISD::SETULE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    // All of the compare-mask comparisons are ordered, but we can switch
    // between the two by a double inversion. E.g. ULE == !OGT.
    Invert = true;
    changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
                          CondCode, CondCode2);
    break;
  }
}

static bool isLegalArithImmed(uint64_t C) {
  // Matches AArch64DAGToDAGISel::SelectArithImmed().
  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
  LLVM_DEBUG(dbgs() << "Is imm " << C
                    << " legal: " << (IsLegal ? "yes\n" : "no\n"));
  return IsLegal;
}

// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
// can be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
// everything is fine. If not then the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
//
// So, finally, the only LLVM-native comparisons that don't mention C and V
// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
// the absence of information about op2.
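//
// For instance (an illustrative case): with op1 == 5 and op2 == 0,
// "CMP op1, #0" is SUBS 5 - 0 and sets C = 1 (no borrow), while
// "CMN op1, #0" is ADDS 5 + 0 and sets C = 0 (no carry out). An unsigned
// predicate such as SETUGE would therefore read different C flags from the
// two forms, whereas SETEQ/SETNE only consume Z, which both forms compute
// identically.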
static bool isCMN(SDValue Op, ISD::CondCode CC) {
  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
         (CC == ISD::SETEQ || CC == ISD::SETNE);
}

static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
                                      SelectionDAG &DAG, SDValue Chain,
                                      bool IsSignaling) {
  EVT VT = LHS.getValueType();
  assert(VT != MVT::f128);
  assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
  unsigned Opcode =
      IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
}

static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              const SDLoc &dl, SelectionDAG &DAG) {
  EVT VT = LHS.getValueType();
  const bool FullFP16 =
      static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();

  if (VT.isFloatingPoint()) {
    assert(VT != MVT::f128);
    if (VT == MVT::f16 && !FullFP16) {
      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
      VT = MVT::f32;
    }
    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
  }

  // The CMP instruction is just an alias for SUBS, and representing it as
  // SUBS means that it's possible to get CSE with subtract operations.
  // A later phase can perform the optimization of setting the destination
  // register to WZR/XZR if it ends up being unused.
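  // For example, "cmp w0, w1" is really "subs wzr, w0, w1", so a nearby
  // "sub w2, w0, w1" can share the same SUBS node.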
  unsigned Opcode = AArch64ISD::SUBS;

  if (isCMN(RHS, CC)) {
    // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
    Opcode = AArch64ISD::ADDS;
    RHS = RHS.getOperand(1);
  } else if (isCMN(LHS, CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted; can
    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
    Opcode = AArch64ISD::ADDS;
    LHS = LHS.getOperand(1);
  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
    if (LHS.getOpcode() == ISD::AND) {
      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
      // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
      // of the signed comparisons.
      const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
                                           DAG.getVTList(VT, MVT_CC),
                                           LHS.getOperand(0),
                                           LHS.getOperand(1));
      // Replace all users of (and X, Y) with newly generated (ands X, Y)
      DAG.ReplaceAllUsesWith(LHS, ANDSNode);
      return ANDSNode.getValue(1);
    } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
      // Use result of ANDS
      return LHS.getValue(1);
    }
  }

  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
      .getValue(1);
}

/// \defgroup AArch64CCMP CMP;CCMP matching
///
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows expressing arbitrary conjunctions, for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
/// expressed as:
///   cmp A
///   ccmp B, inv(CB), CA
///   check for CB flags
///
/// This naturally lets us implement chains of AND operations with SETCC
/// operands. And we can even implement some other situations by transforming
/// them:
///   - We can implement (NEG SETCC) i.e. negating a single comparison by
///     negating the flags used in a CCMP/FCCMP operations.
///   - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
///     by negating the flags we test for afterwards. i.e.
///     NEG (CMP CCMP CCCMP ...) can be implemented.
///   - Note that we can only ever negate all previously processed results.
///     What we can not implement by flipping the flags to test is a negation
///     of two sub-trees (because the negation affects all sub-trees emitted so
///     far, so the 2nd sub-tree we emit would also affect the first).
/// With those tools we can implement some OR operations:
///   - (OR (SETCC A) (SETCC B)) can be implemented via:
///     NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
///   - After transforming OR to NEG/AND combinations we may be able to use NEG
///     elimination rules from earlier to implement the whole thing as a
///     CCMP/FCCMP chain.
///
/// As complete example:
///   or (or (setCA (cmp A)) (setCB (cmp B)))
///      (and (setCC (cmp C)) (setCD (cmp D)))
/// can be reassociated to:
///   or (and (setCC (cmp C)) (setCD (cmp D)))
///      (or (setCA (cmp A)) (setCB (cmp B)))
/// and transformed to:
///   not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
///            (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
/// which can be implemented as:
///   cmp C
///   ccmp D, inv(CD), CC
///   ccmp A, CA, inv(CD)
///   ccmp B, CB, inv(CA)
///   check for CB flags
///
/// A counterexample is "or (and A B) (and C D)" which translates to
/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
/// can only implement 1 of the inner (not) operations, but not both!
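///
/// As a small concrete illustration (hypothetical source, not from this file),
/// "a == 0 && b == 5" can be selected as:
///   cmp  w0, #0           ; first comparison sets NZCV
///   ccmp w1, #5, #0, eq   ; if eq holds, compare b with 5,
///                         ; otherwise NZCV := #0, which fails the final test
///   cset w0, eq           ; final flags check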
/// @{

/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
                                         ISD::CondCode CC, SDValue CCOp,
                                         AArch64CC::CondCode Predicate,
                                         AArch64CC::CondCode OutCC,
                                         const SDLoc &DL, SelectionDAG &DAG) {
  unsigned Opcode = 0;
  const bool FullFP16 =
      static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();

  if (LHS.getValueType().isFloatingPoint()) {
    assert(LHS.getValueType() != MVT::f128);
    if (LHS.getValueType() == MVT::f16 && !FullFP16) {
      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
    }
    Opcode = AArch64ISD::FCCMP;
  } else if (RHS.getOpcode() == ISD::SUB) {
    SDValue SubOp0 = RHS.getOperand(0);
    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
      // See emitComparison() on why we can only do this for SETEQ and SETNE.
      Opcode = AArch64ISD::CCMN;
      RHS = RHS.getOperand(1);
    }
  }
  if (Opcode == 0)
    Opcode = AArch64ISD::CCMP;

  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}

/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
/// expressed as a conjunction. See \ref AArch64CCMP.
/// \param CanNegate   Set to true if we can negate the whole sub-tree just by
///                    changing the conditions on the SETCC tests.
///                    (this means we can call emitConjunctionRec() with
///                     Negate==true on this sub-tree)
/// \param MustBeFirst Set to true if this subtree needs to be negated and we
///                    cannot do the negation naturally. We are required to
///                    emit the subtree first in this case.
/// \param WillNegate  Is true if we are called when the result of this
///                    subexpression must be negated. This happens when the
///                    outer expression is an OR. We can use this fact to know
///                    that we have a double negation (or (or ...) ...) that
///                    can be implemented for free.
static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
                               bool &MustBeFirst, bool WillNegate,
                               unsigned Depth = 0) {
  if (!Val.hasOneUse())
    return false;
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    if (Val->getOperand(0).getValueType() == MVT::f128)
      return false;
    CanNegate = true;
    MustBeFirst = false;
    return true;
  }
  // Protect against exponential runtime and stack overflow.
  if (Depth > 6)
    return false;
  if (Opcode == ISD::AND || Opcode == ISD::OR) {
    bool IsOR = Opcode == ISD::OR;
    SDValue O0 = Val->getOperand(0);
    SDValue O1 = Val->getOperand(1);
    bool CanNegateL;
    bool MustBeFirstL;
    if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth + 1))
      return false;
    bool CanNegateR;
    bool MustBeFirstR;
    if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth + 1))
      return false;

    if (MustBeFirstL && MustBeFirstR)
      return false;

    if (IsOR) {
      // For an OR expression we need to be able to naturally negate at least
      // one side or we cannot do the transformation at all.
      if (!CanNegateL && !CanNegateR)
        return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leafs, then this sub-tree as a whole negates naturally.
      CanNegate = WillNegate && CanNegateL && CanNegateR;
      // If we cannot naturally negate the whole sub-tree, then this must be
      // emitted first.
      MustBeFirst = !CanNegate;
    } else {
      assert(Opcode == ISD::AND && "Must be OR or AND");
      // We cannot naturally negate an AND operation.
      CanNegate = false;
      MustBeFirst = MustBeFirstL || MustBeFirstR;
    }
    return true;
  }
  return false;
}

/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series of
/// compare and conditional compare operations. @returns an NZCV flags
/// producing node and sets @p OutCC to the flags that should be tested or
/// returns SDValue() if transformation was not possible.
/// \p Negate is true if we want this sub-tree being negated just by changing
/// SETCC conditions.
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
    AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
    AArch64CC::CondCode Predicate) {
  // We're at a tree leaf, produce a conditional comparison operation.
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    SDValue LHS = Val->getOperand(0);
    SDValue RHS = Val->getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
    bool isInteger = LHS.getValueType().isInteger();
    if (Negate)
      CC = getSetCCInverse(CC, LHS.getValueType());
    SDLoc DL(Val);
    // Determine OutCC and handle FP special case.
    if (isInteger) {
      OutCC = changeIntCCToAArch64CC(CC);
    } else {
      assert(LHS.getValueType().isFloatingPoint());
      AArch64CC::CondCode ExtraCC;
      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
      // Some floating point conditions can't be tested with a single condition
      // code. Construct an additional comparison in this case.
      if (ExtraCC != AArch64CC::AL) {
        SDValue ExtraCmp;
        if (!CCOp.getNode())
          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
        else
          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
                                               ExtraCC, DL, DAG);
        CCOp = ExtraCmp;
        Predicate = ExtraCC;
      }
    }

    // Produce a normal comparison if we are first in the chain
    if (!CCOp)
      return emitComparison(LHS, RHS, CC, DL, DAG);
    // Otherwise produce a ccmp.
    return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
                                     DAG);
  }
  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");

  bool IsOR = Opcode == ISD::OR;

  SDValue LHS = Val->getOperand(0);
  bool CanNegateL;
  bool MustBeFirstL;
  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
  assert(ValidL && "Valid conjunction/disjunction tree");
  (void)ValidL;

  SDValue RHS = Val->getOperand(1);
  bool CanNegateR;
  bool MustBeFirstR;
  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
  assert(ValidR && "Valid conjunction/disjunction tree");
  (void)ValidR;

  // Swap sub-tree that must come first to the right side.
  if (MustBeFirstL) {
    assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
    std::swap(LHS, RHS);
    std::swap(CanNegateL, CanNegateR);
    std::swap(MustBeFirstL, MustBeFirstR);
  }

  bool NegateR;
  bool NegateAfterR;
  bool NegateL;
  bool NegateAfterAll;
  if (Opcode == ISD::OR) {
    // Swap the sub-tree that we can negate naturally to the left.
    if (!CanNegateL) {
      assert(CanNegateR && "at least one side must be negatable");
      assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
      assert(!Negate);
      std::swap(LHS, RHS);
      NegateR = false;
      NegateAfterR = true;
    } else {
      // Negate the left sub-tree if possible, otherwise negate the result.
      NegateR = CanNegateR;
      NegateAfterR = !CanNegateR;
    }
    NegateL = true;
    NegateAfterAll = !Negate;
  } else {
    assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
    assert(!Negate && "Valid conjunction/disjunction tree");

    NegateL = false;
    NegateR = false;
    NegateAfterR = false;
    NegateAfterAll = false;
  }

  // Emit sub-trees.
  AArch64CC::CondCode RHSCC;
  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
  if (NegateAfterR)
    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
  if (NegateAfterAll)
    OutCC = AArch64CC::getInvertedCondCode(OutCC);
  return CmpL;
}

/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
/// In some cases this is even possible with OR operations in the expression.
/// See \ref AArch64CCMP.
/// \see emitConjunctionRec().
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
                               AArch64CC::CondCode &OutCC) {
  bool DummyCanNegate;
  bool DummyMustBeFirst;
  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
    return SDValue();

  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
}

/// @}

/// Returns how profitable it is to fold a comparison's operand's shift and/or
/// extension operations.
static unsigned getCmpOperandFoldingProfit(SDValue Op) {
  auto isSupportedExtend = [&](SDValue V) {
    if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
      return true;

    if (V.getOpcode() == ISD::AND)
      if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
        uint64_t Mask = MaskCst->getZExtValue();
        return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
      }

    return false;
  };

  if (!Op.hasOneUse())
    return 0;

  if (isSupportedExtend(Op))
    return 1;

  unsigned Opc = Op.getOpcode();
  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
    if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      uint64_t Shift = ShiftCst->getZExtValue();
      if (isSupportedExtend(Op.getOperand(0)))
        return (Shift <= 4) ? 2 : 1;
      EVT VT = Op.getValueType();
      if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
        return 1;
    }

  return 0;
}

static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                             SDValue &AArch64cc, SelectionDAG &DAG,
                             const SDLoc &dl) {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    EVT VT = RHS.getValueType();
    uint64_t C = RHSC->getZExtValue();
    if (!isLegalArithImmed(C)) {
      // Constant does not fit, try adjusting it by one?
      switch (CC) {
      default:
        break;
      case ISD::SETLT:
      case ISD::SETGE:
        if ((VT == MVT::i32 && C != 0x80000000 &&
             isLegalArithImmed((uint32_t)(C - 1))) ||
            (VT == MVT::i64 && C != 0x80000000ULL &&
             isLegalArithImmed(C - 1ULL))) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        if ((VT == MVT::i32 && C != 0 &&
             isLegalArithImmed((uint32_t)(C - 1))) ||
            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        if ((VT == MVT::i32 && C != INT32_MAX &&
             isLegalArithImmed((uint32_t)(C + 1))) ||
            (VT == MVT::i64 && C != INT64_MAX &&
             isLegalArithImmed(C + 1ULL))) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        if ((VT == MVT::i32 && C != UINT32_MAX &&
             isLegalArithImmed((uint32_t)(C + 1))) ||
            (VT == MVT::i64 && C != UINT64_MAX &&
             isLegalArithImmed(C + 1ULL))) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      }
    }
  }
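
  // For example (an illustrative case): "x setlt 0x1001" has an RHS that is not
  // a legal arithmetic immediate, but it is equivalent to "x setle 0x1000",
  // whose constant fits the 12-bit (optionally shifted) encoding, so the
  // immediate and the condition are adjusted instead of materializing 0x1001
  // in a register.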

  // Comparisons are canonicalized so that the RHS operand is simpler than the
  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
  // can fold some shift+extend operations on the RHS operand, so swap the
  // operands if that can be done.
  //
  // For example:
  //    lsl     w13, w11, #1
  //    cmp     w13, w12
  // can be turned into:
  //    cmp     w12, w11, lsl #1
  if (!isa<ConstantSDNode>(RHS) ||
      !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
    SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;

    if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);
    }
  }

  SDValue Cmp;
  AArch64CC::CondCode AArch64CC;
  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
    const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);

    // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
    // For the i8 operand, the largest immediate is 255, so this can be easily
    // encoded in the compare instruction. For the i16 operand, however, the
    // largest immediate cannot be encoded in the compare.
    // Therefore, use a sign extending load and cmn to avoid materializing the
    // -1 constant. For example,
    // movz w1, #65535
    // ldrh w0, [x0, #0]
    // cmp w0, w1
    // ->
    // ldrsh w0, [x0, #0]
    // cmn w0, #1
    // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
    // if and only if (sext LHS) == (sext RHS). The checks are in place to
    // ensure both the LHS and RHS are truly zero extended and to make sure the
    // transformation is profitable.
    if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
        cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
        cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
        LHS.getNode()->hasNUsesOfValue(1, 0)) {
      int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
      if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
        SDValue SExt =
            DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
                        DAG.getValueType(MVT::i16));
        Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
                                                   RHS.getValueType()),
                             CC, dl, DAG);
        AArch64CC = changeIntCCToAArch64CC(CC);
      }
    }

    if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
      if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
        if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
          AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
      }
    }
  }

  if (!Cmp) {
    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
    AArch64CC = changeIntCCToAArch64CC(CC);
  }
  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
  return Cmp;
}

static std::pair<SDValue, SDValue>
getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
         "Unsupported value type");
  SDValue Value, Overflow;
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  unsigned Opc = 0;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::SADDO:
    Opc = AArch64ISD::ADDS;
    CC = AArch64CC::VS;
    break;
  case ISD::UADDO:
    Opc = AArch64ISD::ADDS;
    CC = AArch64CC::HS;
    break;
  case ISD::SSUBO:
    Opc = AArch64ISD::SUBS;
    CC = AArch64CC::VS;
    break;
  case ISD::USUBO:
    Opc = AArch64ISD::SUBS;
    CC = AArch64CC::LO;
    break;
  // Multiply needs a little bit extra work.
  case ISD::SMULO:
  case ISD::UMULO: {
    CC = AArch64CC::NE;
    bool IsSigned = Op.getOpcode() == ISD::SMULO;
    if (Op.getValueType() == MVT::i32) {
      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      // For a 32 bit multiply with overflow check we want the instruction
      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
      // need to generate the following pattern:
      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
                                DAG.getConstant(0, DL, MVT::i64));
      // On AArch64 the upper 32 bits are always zero extended for a 32 bit
      // operation. We need to clear out the upper 32 bits, because we used a
      // widening multiply that wrote all 64 bits. In the end this should be a
      // noop.
      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
      if (IsSigned) {
        // The signed overflow check requires more than just a simple check for
        // any bit set in the upper 32 bits of the result. These bits could be
        // just the sign bits of a negative number. To perform the overflow
        // check we have to arithmetic shift right the 32nd bit of the result by
        // 31 bits. Then we compare the result to the upper 32 bits.
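        //
        // A worked instance (illustrative): for %a = -1, %b = 1 the widened
        // multiply yields 0xFFFFFFFFFFFFFFFF; its upper word (0xFFFFFFFF)
        // equals the low word shifted right arithmetically by 31 (also
        // 0xFFFFFFFF), so the SUBS below compares equal and no overflow is
        // flagged.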
        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
                                        DAG.getConstant(32, DL, MVT::i64));
        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
                                        DAG.getConstant(31, DL, MVT::i64));
        // It is important that LowerBits is last, otherwise the arithmetic
        // shift will not be folded into the compare (SUBS).
        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
        Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
                       .getValue(1);
      } else {
        // The overflow check for unsigned multiply is easy. We only need to
        // check if any of the upper 32 bits are set. This can be done with a
        // CMP (shifted register). For that we need to generate the following
        // pattern:
        // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
                                        DAG.getConstant(32, DL, MVT::i64));
        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
        Overflow =
            DAG.getNode(AArch64ISD::SUBS, DL, VTs,
                        DAG.getConstant(0, DL, MVT::i64),
                        UpperBits).getValue(1);
      }
      break;
    }
    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
    // For the 64 bit multiply
    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
    if (IsSigned) {
      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
                                      DAG.getConstant(63, DL, MVT::i64));
      // It is important that LowerBits is last, otherwise the arithmetic
      // shift will not be folded into the compare (SUBS).
      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
                     .getValue(1);
    } else {
      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
      Overflow =
          DAG.getNode(AArch64ISD::SUBS, DL, VTs,
                      DAG.getConstant(0, DL, MVT::i64),
                      UpperBits).getValue(1);
    }
    break;
  }
  } // switch (...)

  if (Opc) {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);

    // Emit the AArch64 operation with overflow check.
    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
    Overflow = Value.getValue(1);
  }
  return std::make_pair(Value, Overflow);
}

SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
                                             RTLIB::Libcall Call) const {
  bool IsStrict = Op->isStrictFPOpcode();
  unsigned Offset = IsStrict ? 1 : 0;
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
  MakeLibCallOptions CallOptions;
  SDValue Result;
  SDLoc dl(Op);
  std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops,
                                        CallOptions, dl, Chain);
  return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
}

static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
  SDValue Sel = Op.getOperand(0);
  SDValue Other = Op.getOperand(1);
  SDLoc dl(Sel);

  // If the operand is an overflow checking operation, invert the condition
  // code and kill the Not operation. I.e., transform:
  // (xor (overflow_op_bool, 1))
  //   -->
  // (csel 1, 0, invert(cc), overflow_op_bool)
  // ... which later gets transformed to just a cset instruction with an
  // inverted condition code, rather than a cset + eor sequence.
  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
      return SDValue();

    SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
    SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
    AArch64CC::CondCode CC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
    SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
                       CCVal, Overflow);
  }
  // If neither operand is a SELECT_CC, give up.
  if (Sel.getOpcode() != ISD::SELECT_CC)
    std::swap(Sel, Other);
  if (Sel.getOpcode() != ISD::SELECT_CC)
    return Op;

  // The folding we want to perform is:
  // (xor x, (select_cc a, b, cc, 0, -1) )
  //   -->
  // (csel x, (xor x, -1), cc ...)
  //
  // The latter will get matched to a CSINV instruction.

  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
  SDValue LHS = Sel.getOperand(0);
  SDValue RHS = Sel.getOperand(1);
  SDValue TVal = Sel.getOperand(2);
  SDValue FVal = Sel.getOperand(3);

  // FIXME: This could be generalized to non-integer comparisons.
  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
    return Op;

  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

  // The values aren't constants, this isn't the pattern we're looking for.
  if (!CFVal || !CTVal)
    return Op;

  // We can commute the SELECT_CC by inverting the condition. This
  // might be needed to make this fit into a CSINV pattern.
  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
    std::swap(TVal, FVal);
    std::swap(CTVal, CFVal);
    CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  }

  // If the constants line up, perform the transform!
  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);

    FVal = Other;
    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
                       DAG.getConstant(-1ULL, dl, Other.getValueType()));

    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
                       CCVal, Cmp);
  }

  return Op;
}

static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  unsigned Opc;
  bool ExtraOp = false;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Invalid code");
  case ISD::ADDC:
    Opc = AArch64ISD::ADDS;
    break;
  case ISD::SUBC:
    Opc = AArch64ISD::SUBS;
    break;
  case ISD::ADDE:
    Opc = AArch64ISD::ADCS;
    ExtraOp = true;
    break;
  case ISD::SUBE:
    Opc = AArch64ISD::SBCS;
    ExtraOp = true;
    break;
  }

  if (!ExtraOp)
    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
                     Op.getOperand(2));
}
2649
2650 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
2651 // Let legalize expand this if it isn't a legal type yet.
2652 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2653 return SDValue();
2654
2655 SDLoc dl(Op);
2656 AArch64CC::CondCode CC;
2657 // The actual operation that sets the overflow or carry flag.
2658 SDValue Value, Overflow;
2659 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2660
2661 // We use 0 and 1 as false and true values.
2662 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2663 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2664
2665 // We use an inverted condition, because the conditional select is inverted
2666 // too. This will allow it to be selected to a single instruction:
2667 // CSINC Wd, WZR, WZR, invert(cond).
2668 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2669 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2670 CCVal, Overflow);
2671
2672 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2673 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2674}
2675
2676// Prefetch operands are:
2677// 1: Address to prefetch
2678// 2: bool isWrite
2679// 3: int locality (0 = no locality ... 3 = extreme locality)
2680// 4: bool isDataCache
2681 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2682 SDLoc DL(Op);
2683 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2684 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2685 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2686
2687 bool IsStream = !Locality;
2688 // When the locality number is set
2689 if (Locality) {
2690 // The front-end should have filtered out the out-of-range values
2691 assert(Locality <= 3 && "Prefetch locality out-of-range");
2692 // The locality degree runs opposite to the instruction encoding: a
2693 // locality of 3 means "keep in the closest cache", while the encoding
2694 // uses 0 for level 1, so invert the value.
2695 Locality = 3 - Locality;
2696 }
2697
2698 // Build the mask value encoding the expected behavior.
2699 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
2700 (!IsData << 3) | // IsDataCache bit
2701 (Locality << 1) | // Cache level bits
2702 (unsigned)IsStream; // Stream bit
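// Worked example of the layout above: a read prefetch of data with maximal
// locality (IsWrite=0, Locality=3, IsData=1) has IsStream=0 and Locality
// inverted to 0, so PrfOp = (0<<4)|(0<<3)|(0<<1)|0 = 0, i.e. PLDL1KEEP.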
2703 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2704 DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2705}
2706
2707SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2708 SelectionDAG &DAG) const {
2709 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2710
2711 RTLIB::Libcall LC;
2712 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2713
2714 return LowerF128Call(Op, DAG, LC);
2715}
2716
2717SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2718 SelectionDAG &DAG) const {
2719 bool IsStrict = Op->isStrictFPOpcode();
2720 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
2721 EVT SrcVT = SrcVal.getValueType();
2722
2723 if (SrcVT != MVT::f128) {
2724 // Expand cases where the input is a vector bigger than NEON.
2725 if (useSVEForFixedLengthVectorVT(SrcVT))
2726 return SDValue();
2727
2728 // It's legal except when f128 is involved
2729 return Op;
2730 }
2731
2732 RTLIB::Libcall LC;
2733 LC = RTLIB::getFPROUND(SrcVT, Op.getValueType());
2734
2735 // FP_ROUND node has a second operand indicating whether it is known to be
2736 // precise. That doesn't take part in the LibCall so we can't directly use
2737 // LowerF128Call.
2738 MakeLibCallOptions CallOptions;
2739 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
2740 SDValue Result;
2741 SDLoc dl(Op);
2742 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
2743 CallOptions, dl, Chain);
2744 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
2745}
2746
2747SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
2748 SelectionDAG &DAG) const {
2749 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2750 // Any additional optimization in this function should be recorded
2751 // in the cost tables.
2752 EVT InVT = Op.getOperand(0).getValueType();
2753 EVT VT = Op.getValueType();
2754 unsigned NumElts = InVT.getVectorNumElements();
2755
2756 // f16 conversions are promoted to f32 when full fp16 is not supported.
2757 if (InVT.getVectorElementType() == MVT::f16 &&
2758 !Subtarget->hasFullFP16()) {
2759 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2760 SDLoc dl(Op);
2761 return DAG.getNode(
2762 Op.getOpcode(), dl, Op.getValueType(),
2763 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2764 }
2765
2766 if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2767 SDLoc dl(Op);
2768 SDValue Cv =
2769 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2770 Op.getOperand(0));
2771 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2772 }
2773
2774 if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2775 SDLoc dl(Op);
2776 MVT ExtVT =
2777 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
2778 VT.getVectorNumElements());
2779 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2780 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2781 }
2782
2783 // Type changing conversions are illegal.
2784 return Op;
2785}
2786
2787SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2788 SelectionDAG &DAG) const {
2789 bool IsStrict = Op->isStrictFPOpcode();
2790 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
2791
2792 if (SrcVal.getValueType().isVector())
2793 return LowerVectorFP_TO_INT(Op, DAG);
2794
2795 // f16 conversions are promoted to f32 when full fp16 is not supported.
2796 if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
2797 assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
2798 SDLoc dl(Op);
2799 return DAG.getNode(
2800 Op.getOpcode(), dl, Op.getValueType(),
2801 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
2802 }
2803
2804 if (SrcVal.getValueType() != MVT::f128) {
2805 // It's legal except when f128 is involved
2806 return Op;
2807 }
2808
2809 RTLIB::Libcall LC;
2810 if (Op.getOpcode() == ISD::FP_TO_SINT ||
2811 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
2812 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType());
2813 else
2814 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType());
2815
2816 return LowerF128Call(Op, DAG, LC);
2817}
2818
2819 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
2820 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2821 // Any additional optimization in this function should be recorded
2822 // in the cost tables.
2823 EVT VT = Op.getValueType();
2824 SDLoc dl(Op);
2825 SDValue In = Op.getOperand(0);
2826 EVT InVT = In.getValueType();
2827
2828 if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2829 MVT CastVT =
2830 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
2831 InVT.getVectorNumElements());
2832 In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2833 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2834 }
2835
2836 if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2837 unsigned CastOpc =
2838 Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2839 EVT CastVT = VT.changeVectorElementTypeToInteger();
2840 In = DAG.getNode(CastOpc, dl, CastVT, In);
2841 return DAG.getNode(Op.getOpcode(), dl, VT, In);
2842 }
2843
2844 return Op;
2845}
2846
2847SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2848 SelectionDAG &DAG) const {
2849 if (Op.getValueType().isVector())
2850 return LowerVectorINT_TO_FP(Op, DAG);
2851
2852 bool IsStrict = Op->isStrictFPOpcode();
2853 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
2854
2855 // f16 conversions are promoted to f32 when full fp16 is not supported.
2856 if (Op.getValueType() == MVT::f16 &&
2857 !Subtarget->hasFullFP16()) {
2858 assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
2859 SDLoc dl(Op);
2860 return DAG.getNode(
2861 ISD::FP_ROUND, dl, MVT::f16,
2862 DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
2863 DAG.getIntPtrConstant(0, dl));
2864 }
2865
2866 // i128 conversions are libcalls.
2867 if (SrcVal.getValueType() == MVT::i128)
2868 return SDValue();
2869
2870 // Other conversions are legal, unless it's to the completely software-based
2871 // fp128.
2872 if (Op.getValueType() != MVT::f128)
2873 return Op;
2874
2875 RTLIB::Libcall LC;
2876 if (Op.getOpcode() == ISD::SINT_TO_FP ||
2877 Op.getOpcode() == ISD::STRICT_SINT_TO_FP)
2878 LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType());
2879 else
2880 LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType());
2881
2882 return LowerF128Call(Op, DAG, LC);
2883}
2884
2885SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2886 SelectionDAG &DAG) const {
2887 // For iOS, we want to call an alternative entry point: __sincos_stret,
2888 // which returns the values in two S / D registers.
2889 SDLoc dl(Op);
2890 SDValue Arg = Op.getOperand(0);
2891 EVT ArgVT = Arg.getValueType();
2892 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2893
2894 ArgListTy Args;
2895 ArgListEntry Entry;
2896
2897 Entry.Node = Arg;
2898 Entry.Ty = ArgTy;
2899 Entry.IsSExt = false;
2900 Entry.IsZExt = false;
2901 Args.push_back(Entry);
2902
2903 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2904 : RTLIB::SINCOS_STRET_F32;
2905 const char *LibcallName = getLibcallName(LC);
2906 SDValue Callee =
2907 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2908
2909 StructType *RetTy = StructType::get(ArgTy, ArgTy);
2910 TargetLowering::CallLoweringInfo CLI(DAG);
2911 CLI.setDebugLoc(dl)
2912 .setChain(DAG.getEntryNode())
2913 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2914
2915 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2916 return CallResult.first;
2917}
2918
2919 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
2920 EVT OpVT = Op.getValueType();
2921 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
2922 return SDValue();
2923
2924 assert(Op.getOperand(0).getValueType() == MVT::i16);
2925 SDLoc DL(Op);
2926
2927 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2928 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2929 return SDValue(
2930 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
2931 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2932 0);
2933}
2934
2935 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2936 if (OrigVT.getSizeInBits() >= 64)
2937 return OrigVT;
2938
2939 assert(OrigVT.isSimple() && "Expecting a simple value type");
2940
2941 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2942 switch (OrigSimpleTy) {
2943 default: llvm_unreachable("Unexpected Vector Type");
2944 case MVT::v2i8:
2945 case MVT::v2i16:
2946 return MVT::v2i32;
2947 case MVT::v4i8:
2948 return MVT::v4i16;
2949 }
2950}
2951
2952 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
2953 const EVT &OrigTy,
2954 const EVT &ExtTy,
2955 unsigned ExtOpcode) {
2956 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2957 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2958 // 64-bits we need to insert a new extension so that it will be 64-bits.
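// e.g. a v4i8 operand (32 bits) is widened to v4i16 (64 bits) here so the
// widening multiply can produce a legal 128-bit result.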
2959 assert(ExtTy.is128BitVector() && "Unexpected extension size");
2960 if (OrigTy.getSizeInBits() >= 64)
2961 return N;
2962
2963 // Must extend size to at least 64 bits to be used as an operand for VMULL.
2964 EVT NewVT = getExtensionTo64Bits(OrigTy);
2965
2966 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2967}
2968
2969 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
2970 bool isSigned) {
2971 EVT VT = N->getValueType(0);
2972
2973 if (N->getOpcode() != ISD::BUILD_VECTOR)
2974 return false;
2975
2976 for (const SDValue &Elt : N->op_values()) {
2977 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2978 unsigned EltSize = VT.getScalarSizeInBits();
2979 unsigned HalfSize = EltSize / 2;
2980 if (isSigned) {
2981 if (!isIntN(HalfSize, C->getSExtValue()))
2982 return false;
2983 } else {
2984 if (!isUIntN(HalfSize, C->getZExtValue()))
2985 return false;
2986 }
2987 continue;
2988 }
2989 return false;
2990 }
2991
2992 return true;
2993}
2994
2995 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
2996 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2997 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
2998 N->getOperand(0)->getValueType(0),
2999 N->getValueType(0),
3000 N->getOpcode());
3001
3002 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
3003 EVT VT = N->getValueType(0);
3004 SDLoc dl(N);
3005 unsigned EltSize = VT.getScalarSizeInBits() / 2;
3006 unsigned NumElts = VT.getVectorNumElements();
3007 MVT TruncVT = MVT::getIntegerVT(EltSize);
3008 SmallVector<SDValue, 8> Ops;
3009 for (unsigned i = 0; i != NumElts; ++i) {
3010 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
3011 const APInt &CInt = C->getAPIntValue();
3012 // Element types smaller than 32 bits are not legal, so use i32 elements.
3013 // The values are implicitly truncated so sext vs. zext doesn't matter.
3014 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
3015 }
3016 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
3017}
3018
3019 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
3020 return N->getOpcode() == ISD::SIGN_EXTEND ||
3021 isExtendedBUILD_VECTOR(N, DAG, true);
3022}
3023
3024 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
3025 return N->getOpcode() == ISD::ZERO_EXTEND ||
3026 isExtendedBUILD_VECTOR(N, DAG, false);
3027}
3028
3029static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
3030 unsigned Opcode = N->getOpcode();
3031 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
3032 SDNode *N0 = N->getOperand(0).getNode();
3033 SDNode *N1 = N->getOperand(1).getNode();
3034 return N0->hasOneUse() && N1->hasOneUse() &&
3035 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
3036 }
3037 return false;
3038}
3039
3040static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
3041 unsigned Opcode = N->getOpcode();
3042 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
3043 SDNode *N0 = N->getOperand(0).getNode();
3044 SDNode *N1 = N->getOperand(1).getNode();
3045 return N0->hasOneUse() && N1->hasOneUse() &&
3046 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
3047 }
3048 return false;
3049}
3050
3051SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
3052 SelectionDAG &DAG) const {
3053 // The rounding mode is in bits 23:22 of the FPCR.
3054 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
3055 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3
3056 // so that the shift + and get folded into a bitfield extract.
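// Worked example of the mapping: an FPCR rounding-mode field of 0b10
// (round toward +infinity) gives ((2 + 1) & 3) = 3 after the shift.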
3057 SDLoc dl(Op);
3058
3059 SDValue Chain = Op.getOperand(0);
3060 SDValue FPCR_64 = DAG.getNode(
3061 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
3062 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
3063 Chain = FPCR_64.getValue(1);
3064 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
3065 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
3066 DAG.getConstant(1U << 22, dl, MVT::i32));
3067 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
3068 DAG.getConstant(22, dl, MVT::i32));
3069 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
3070 DAG.getConstant(3, dl, MVT::i32));
3071 return DAG.getMergeValues({AND, Chain}, dl);
3072}
3073
3075 // Multiplications are only custom-lowered for 128-bit vectors so that
3076 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
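// e.g. (mul (sext v2i32 a), (sext v2i32 b)) : v2i64 is selected as a single
// "smull v0.2d, v1.2s, v2.2s" instead of two extends and an illegal multiply.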
3077 EVT VT = Op.getValueType();
3078 assert(VT.is128BitVector() && VT.isInteger() &&
3079 "unexpected type for custom-lowering ISD::MUL");
3080 SDNode *N0 = Op.getOperand(0).getNode();
3081 SDNode *N1 = Op.getOperand(1).getNode();
3082 unsigned NewOpc = 0;
3083 bool isMLA = false;
3084 bool isN0SExt = isSignExtended(N0, DAG);
3085 bool isN1SExt = isSignExtended(N1, DAG);
3086 if (isN0SExt && isN1SExt)
3087 NewOpc = AArch64ISD::SMULL;
3088 else {
3089 bool isN0ZExt = isZeroExtended(N0, DAG);
3090 bool isN1ZExt = isZeroExtended(N1, DAG);
3091 if (isN0ZExt && isN1ZExt)
3092 NewOpc = AArch64ISD::UMULL;
3093 else if (isN1SExt || isN1ZExt) {
3094 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
3095 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
3096 if (isN1SExt && isAddSubSExt(N0, DAG)) {
3097 NewOpc = AArch64ISD::SMULL;
3098 isMLA = true;
3099 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
3100 NewOpc = AArch64ISD::UMULL;
3101 isMLA = true;
3102 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
3103 std::swap(N0, N1);
3104 NewOpc = AArch64ISD::UMULL;
3105 isMLA = true;
3106 }
3107 }
3108
3109 if (!NewOpc) {
3110 if (VT == MVT::v2i64)
3111 // Fall through to expand this. It is not legal.
3112 return SDValue();
3113 else
3114 // Other vector multiplications are legal.
3115 return Op;
3116 }
3117 }
3118
3119 // Legalize to a S/UMULL instruction
3120 SDLoc DL(Op);
3121 SDValue Op0;
3122 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
3123 if (!isMLA) {
3124 Op0 = skipExtensionForVectorMULL(N0, DAG);
3125 assert(Op0.getValueType().is64BitVector() &&
3126 Op1.getValueType().is64BitVector() &&
3127 "unexpected types for extended operands to VMULL");
3128 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
3129 }
3130 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
3131 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
3132 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
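// e.g. (mul (add (zext v8i8 a), (zext v8i8 b)), (zext v8i8 c)) is rewritten
// below as (add (umull a, c), (umull b, c)).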
3133 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
3134 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
3135 EVT Op1VT = Op1.getValueType();
3136 return DAG.getNode(N0->getOpcode(), DL, VT,
3137 DAG.getNode(NewOpc, DL, VT,
3138 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
3139 DAG.getNode(NewOpc, DL, VT,
3140 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
3141}
3142
3143static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
3144 int Pattern) {
3145 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
3147}
3148
3149SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
3150 SelectionDAG &DAG) const {
3151 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3152 SDLoc dl(Op);
3153 switch (IntNo) {
3154 default: return SDValue(); // Don't custom lower most intrinsics.
3155 case Intrinsic::thread_pointer: {
3156 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3157 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
3158 }
3159 case Intrinsic::aarch64_neon_abs: {
3160 EVT Ty = Op.getValueType();
3161 if (Ty == MVT::i64) {
3162 SDValue Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64,
3163 Op.getOperand(1));
3164 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
3165 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
3166 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
3167 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
3168 } else {
3169 report_fatal_error("Unexpected type for AArch64 NEON intrinic");
3170 }
3171 }
3172 case Intrinsic::aarch64_neon_smax:
3173 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
3174 Op.getOperand(1), Op.getOperand(2));
3175 case Intrinsic::aarch64_neon_umax:
3176 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
3177 Op.getOperand(1), Op.getOperand(2));
3178 case Intrinsic::aarch64_neon_smin:
3179 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
3180 Op.getOperand(1), Op.getOperand(2));
3181 case Intrinsic::aarch64_neon_umin:
3182 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
3183 Op.getOperand(1), Op.getOperand(2));
3184
3185 case Intrinsic::aarch64_sve_sunpkhi:
3186 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
3187 Op.getOperand(1));
3188 case Intrinsic::aarch64_sve_sunpklo:
3189 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
3190 Op.getOperand(1));
3191 case Intrinsic::aarch64_sve_uunpkhi:
3192 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
3193 Op.getOperand(1));
3194 case Intrinsic::aarch64_sve_uunpklo:
3195 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
3196 Op.getOperand(1));
3197 case Intrinsic::aarch64_sve_clasta_n:
3198 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
3199 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3200 case Intrinsic::aarch64_sve_clastb_n:
3201 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
3202 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3203 case Intrinsic::aarch64_sve_lasta:
3204 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
3205 Op.getOperand(1), Op.getOperand(2));
3206 case Intrinsic::aarch64_sve_lastb:
3207 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
3208 Op.getOperand(1), Op.getOperand(2));
3209 case Intrinsic::aarch64_sve_rev:
3210 return DAG.getNode(AArch64ISD::REV, dl, Op.getValueType(),
3211 Op.getOperand(1));
3212 case Intrinsic::aarch64_sve_tbl:
3213 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
3214 Op.getOperand(1), Op.getOperand(2));
3215 case Intrinsic::aarch64_sve_trn1:
3216 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
3217 Op.getOperand(1), Op.getOperand(2));
3218 case Intrinsic::aarch64_sve_trn2:
3219 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
3220 Op.getOperand(1), Op.getOperand(2));
3221 case Intrinsic::aarch64_sve_uzp1:
3222 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
3223 Op.getOperand(1), Op.getOperand(2));
3224 case Intrinsic::aarch64_sve_uzp2:
3225 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
3226 Op.getOperand(1), Op.getOperand(2));
3227 case Intrinsic::aarch64_sve_zip1:
3228 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
3229 Op.getOperand(1), Op.getOperand(2));
3230 case Intrinsic::aarch64_sve_zip2:
3231 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
3232 Op.getOperand(1), Op.getOperand(2));
3233 case Intrinsic::aarch64_sve_ptrue:
3234 return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
3235 Op.getOperand(1));
3236 case Intrinsic::aarch64_sve_dupq_lane:
3237 return LowerDUPQLane(Op, DAG);
3238 case Intrinsic::aarch64_sve_convert_from_svbool:
3239 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
3240 Op.getOperand(1));
3241 case Intrinsic::aarch64_sve_convert_to_svbool: {
3242 EVT OutVT = Op.getValueType();
3243 EVT InVT = Op.getOperand(1).getValueType();
3244 // Return the operand if the cast isn't changing type,
3245 // i.e. <n x 16 x i1> -> <n x 16 x i1>
3246 if (InVT == OutVT)
3247 return Op.getOperand(1);
3248 // Otherwise, zero the newly introduced lanes.
3249 SDValue Reinterpret =
3250 DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1));
3251 SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all);
3252 SDValue MaskReinterpret =
3253 DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask);
3254 return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret);
3255 }
3256
3257 case Intrinsic::aarch64_sve_insr: {
3258 SDValue Scalar = Op.getOperand(2);
3259 EVT ScalarTy = Scalar.getValueType();
3260 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
3261 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
3262
3263 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
3264 Op.getOperand(1), Scalar);
3265 }
3266
3267 case Intrinsic::localaddress: {
3268 const auto &MF = DAG.getMachineFunction();
3269 const auto *RegInfo = Subtarget->getRegisterInfo();
3270 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
3271 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
3272 Op.getSimpleValueType());
3273 }
3274
3275 case Intrinsic::eh_recoverfp: {
3276 // FIXME: This needs to be implemented to correctly handle highly aligned
3277 // stack objects. For now we simply return the incoming FP. Refer D53541
3278 // for more details.
3279 SDValue FnOp = Op.getOperand(1);
3280 SDValue IncomingFPOp = Op.getOperand(2);
3281 auto *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
3282 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
3283 if (!Fn)
3285 "llvm.eh.recoverfp must take a function as the first argument");
3286 return IncomingFPOp;
3287 }
3288
3289 case Intrinsic::aarch64_neon_vsri:
3290 case Intrinsic::aarch64_neon_vsli: {
3291 EVT Ty = Op.getValueType();
3292
3293 if (!Ty.isVector())
3294 report_fatal_error("Unexpected type for aarch64_neon_vsli");
3295
3296 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
3297
3298 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
3299 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
3300 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
3301 Op.getOperand(3));
3302 }
3303
3304 case Intrinsic::aarch64_neon_srhadd:
3305 case Intrinsic::aarch64_neon_urhadd: {
3306 bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd;
3307 unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
3308 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
3309 Op.getOperand(2));
3310 }
3311 }
3312}
3313
3314bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
3315 return ExtVal.getValueType().isScalableVector();
3316}
3317
3318// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
3319 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
3320 EVT VT, EVT MemVT,
3321 SelectionDAG &DAG) {
3322 assert(VT.isVector() && "VT should be a vector type");
3323 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
3324
3325 SDValue Value = ST->getValue();
3326
3327 // First extend the promoted v4i16 to v8i16, truncate it to v8i8, and extract
3328 // the word lane which represents the v4i8 subvector. This optimizes the store
3329 // to:
3330 //
3331 // xtn v0.8b, v0.8h
3332 // str s0, [x0]
3333
3334 SDValue Undef = DAG.getUNDEF(MVT::i16);
3335 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
3336 {Undef, Undef, Undef, Undef});
3337
3338 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
3339 Value, UndefVec);
3340 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
3341
3342 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
3343 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
3344 Trunc, DAG.getConstant(0, DL, MVT::i64));
3345
3346 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
3347 ST->getBasePtr(), ST->getMemOperand());
3348}
3349
3350 // Custom lowering for any store, vector or scalar, with or without a
3351 // truncating operation. Currently we only custom-lower truncating stores
3352 // from v4i16 to v4i8 and volatile stores of i128.
3353SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
3354 SelectionDAG &DAG) const {
3355 SDLoc Dl(Op);
3357 assert (StoreNode && "Can only custom lower store nodes");
3358
3359 SDValue Value = StoreNode->getValue();
3360
3361 EVT VT = Value.getValueType();
3362 EVT MemVT = StoreNode->getMemoryVT();
3363
3364 if (VT.isVector()) {
3365 if (useSVEForFixedLengthVectorVT(VT))
3366 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
3367
3368 unsigned AS = StoreNode->getAddressSpace();
3369 Align Alignment = StoreNode->getAlign();
3370 if (Alignment < MemVT.getStoreSize() &&
3371 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
3372 StoreNode->getMemOperand()->getFlags(),
3373 nullptr)) {
3374 return scalarizeVectorStore(StoreNode, DAG);
3375 }
3376
3377 if (StoreNode->isTruncatingStore()) {
3378 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
3379 }
3380 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
3381 // the custom lowering, as there are no un-paired non-temporal stores and
3382 // legalization will break up 256 bit inputs.
3383 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
3384 MemVT.getVectorElementCount().Min % 2u == 0 &&
3385 ((MemVT.getScalarSizeInBits() == 8u ||
3386 MemVT.getScalarSizeInBits() == 16u ||
3387 MemVT.getScalarSizeInBits() == 32u ||
3388 MemVT.getScalarSizeInBits() == 64u))) {
3389 SDValue Lo =
3390 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
3391 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
3392 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
3393 SDValue Hi = DAG.getNode(
3394 ISD::EXTRACT_SUBVECTOR, Dl,
3395 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
3396 StoreNode->getValue(),
3397 DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
3398 SDValue Result = DAG.getMemIntrinsicNode(
3399 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
3400 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
3401 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
3402 return Result;
3403 }
3404 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
3405 assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
3406 SDValue Lo =
3407 DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
3408 DAG.getConstant(0, Dl, MVT::i64));
3409 SDValue Hi =
3410 DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
3411 DAG.getConstant(1, Dl, MVT::i64));
3412 SDValue Result = DAG.getMemIntrinsicNode(
3413 AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
3414 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
3415 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
3416 return Result;
3417 }
3418
3419 return SDValue();
3420}
3421
3422 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
3423 SelectionDAG &DAG) const {
3424 LLVM_DEBUG(dbgs() << "Custom lowering: ");
3425 LLVM_DEBUG(Op.dump());
3426
3427 switch (Op.getOpcode()) {
3428 default:
3429 llvm_unreachable("unimplemented operand");
3430 return SDValue();
3431 case ISD::BITCAST:
3432 return LowerBITCAST(Op, DAG);
3433 case ISD::GlobalAddress:
3434 return LowerGlobalAddress(Op, DAG);
3435 case ISD::GlobalTLSAddress:
3436 return LowerGlobalTLSAddress(Op, DAG);
3437 case ISD::SETCC:
3438 case ISD::STRICT_FSETCC:
3439 case ISD::STRICT_FSETCCS:
3440 return LowerSETCC(Op, DAG);
3441 case ISD::BR_CC:
3442 return LowerBR_CC(Op, DAG);
3443 case ISD::SELECT:
3444 return LowerSELECT(Op, DAG);
3445 case ISD::SELECT_CC:
3446 return LowerSELECT_CC(Op, DAG);
3447 case ISD::JumpTable:
3448 return LowerJumpTable(Op, DAG);
3449 case ISD::BR_JT:
3450 return LowerBR_JT(Op, DAG);
3451 case ISD::ConstantPool:
3452 return LowerConstantPool(Op, DAG);
3453 case ISD::BlockAddress:
3454 return LowerBlockAddress(Op, DAG);
3455 case ISD::VASTART:
3456 return LowerVASTART(Op, DAG);
3457 case ISD::VACOPY:
3458 return LowerVACOPY(Op, DAG);
3459 case ISD::VAARG:
3460 return LowerVAARG(Op, DAG);
3461 case ISD::ADDC:
3462 case ISD::ADDE:
3463 case ISD::SUBC:
3464 case ISD::SUBE:
3465 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
3466 case ISD::SADDO:
3467 case ISD::UADDO:
3468 case ISD::SSUBO:
3469 case ISD::USUBO:
3470 case ISD::SMULO:
3471 case ISD::UMULO:
3472 return LowerXALUO(Op, DAG);
3473 case ISD::FADD:
3474 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3475 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
3476 return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
3477 case ISD::FSUB:
3478 return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
3479 case ISD::FMUL:
3480 return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
3481 case ISD::FMA:
3482 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
3483 case ISD::FDIV:
3484 return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
3485 case ISD::FP_ROUND:
3486 case ISD::STRICT_FP_ROUND:
3487 return LowerFP_ROUND(Op, DAG);
3488 case ISD::FP_EXTEND:
3489 return LowerFP_EXTEND(Op, DAG);
3490 case ISD::FRAMEADDR:
3491 return LowerFRAMEADDR(Op, DAG);
3492 case ISD::SPONENTRY:
3493 return LowerSPONENTRY(Op, DAG);
3494 case ISD::RETURNADDR:
3495 return LowerRETURNADDR(Op, DAG);
3496 case ISD::ADDROFRETURNADDR:
3497 return LowerADDROFRETURNADDR(Op, DAG);
3498 case ISD::INSERT_VECTOR_ELT:
3499 return LowerINSERT_VECTOR_ELT(Op, DAG);
3500 case ISD::EXTRACT_VECTOR_ELT:
3501 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
3502 case ISD::BUILD_VECTOR:
3503 return LowerBUILD_VECTOR(Op, DAG);
3504 case ISD::VECTOR_SHUFFLE:
3505 return LowerVECTOR_SHUFFLE(Op, DAG);
3506 case ISD::SPLAT_VECTOR:
3507 return LowerSPLAT_VECTOR(Op, DAG);
3508 case ISD::EXTRACT_SUBVECTOR:
3509 return LowerEXTRACT_SUBVECTOR(Op, DAG);
3510 case ISD::INSERT_SUBVECTOR:
3511 return LowerINSERT_SUBVECTOR(Op, DAG);
3512 case ISD::SDIV:
3513 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED);
3514 case ISD::UDIV:
3515 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED);
3516 case ISD::SMIN:
3517 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1);
3518 case ISD::UMIN:
3519 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1);
3520 case ISD::SMAX:
3521 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1);
3522 case ISD::UMAX:
3523 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1);
3524 case ISD::SRA:
3525 case ISD::SRL:
3526 case ISD::SHL:
3527 return LowerVectorSRA_SRL_SHL(Op, DAG);
3528 case ISD::SHL_PARTS:
3529 return LowerShiftLeftParts(Op, DAG);
3530 case ISD::SRL_PARTS:
3531 case ISD::SRA_PARTS:
3532 return LowerShiftRightParts(Op, DAG);
3533 case ISD::CTPOP:
3534 return LowerCTPOP(Op, DAG);
3535 case ISD::FCOPYSIGN:
3536 return LowerFCOPYSIGN(Op, DAG);
3537 case ISD::OR:
3538 return LowerVectorOR(Op, DAG);
3539 case ISD::XOR:
3540 return LowerXOR(Op, DAG);
3541 case ISD::PREFETCH:
3542 return LowerPREFETCH(Op, DAG);
3543 case ISD::SINT_TO_FP:
3544 case ISD::UINT_TO_FP:
3545 case ISD::STRICT_SINT_TO_FP:
3546 case ISD::STRICT_UINT_TO_FP:
3547 return LowerINT_TO_FP(Op, DAG);
3548 case ISD::FP_TO_SINT:
3549 case ISD::FP_TO_UINT:
3550 case ISD::STRICT_FP_TO_SINT:
3551 case ISD::STRICT_FP_TO_UINT:
3552 return LowerFP_TO_INT(Op, DAG);
3553 case ISD::FSINCOS:
3554 return LowerFSINCOS(Op, DAG);
3555 case ISD::FLT_ROUNDS_:
3556 return LowerFLT_ROUNDS_(Op, DAG);
3557 case ISD::MUL:
3558 return LowerMUL(Op, DAG);
3559 case ISD::INTRINSIC_WO_CHAIN:
3560 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3561 case ISD::STORE:
3562 return LowerSTORE(Op, DAG);
3563 case ISD::VECREDUCE_ADD:
3564 case ISD::VECREDUCE_SMAX:
3565 case ISD::VECREDUCE_SMIN:
3566 case ISD::VECREDUCE_UMAX:
3567 case ISD::VECREDUCE_UMIN:
3568 case ISD::VECREDUCE_FMAX:
3569 case ISD::VECREDUCE_FMIN:
3570 return LowerVECREDUCE(Op, DAG);
3571 case ISD::ATOMIC_LOAD_SUB:
3572 return LowerATOMIC_LOAD_SUB(Op, DAG);
3573 case ISD::ATOMIC_LOAD_AND:
3574 return LowerATOMIC_LOAD_AND(Op, DAG);
3575 case ISD::DYNAMIC_STACKALLOC:
3576 return LowerDYNAMIC_STACKALLOC(Op, DAG);
3577 case ISD::VSCALE:
3578 return LowerVSCALE(Op, DAG);
3579 case ISD::TRUNCATE:
3580 return LowerTRUNCATE(Op, DAG);
3581 case ISD::LOAD:
3582 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3583 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
3584 llvm_unreachable("Unexpected request to lower ISD::LOAD");
3585 case ISD::ADD:
3586 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3587 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
3588 llvm_unreachable("Unexpected request to lower ISD::ADD");
3589 }
3590}
3591
3592bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
3593 // Prefer NEON unless larger SVE registers are available.
3594 return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
3595}
3596
3597bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const {
3598 if (!useSVEForFixedLengthVectors())
3599 return false;
3600
3601 if (!VT.isFixedLengthVector())
3602 return false;
3603
3604 // Fixed length predicates should be promoted to i8.
3605 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
3606 if (VT.getVectorElementType() == MVT::i1)
3607 return false;
3608
3609 // Don't use SVE for vectors we cannot scalarize if required.
3610 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
3611 default:
3612 return false;
3613 case MVT::i8:
3614 case MVT::i16:
3615 case MVT::i32:
3616 case MVT::i64:
3617 case MVT::f16:
3618 case MVT::f32:
3619 case MVT::f64:
3620 break;
3621 }
3622
3623 // Ensure NEON MVTs only belong to a single register class.
3624 if (VT.getSizeInBits() <= 128)
3625 return false;
3626
3627 // Don't use SVE for types that don't fit.
3628 if (VT.getSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
3629 return false;
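// e.g. (illustrative) with a 512-bit minimum SVE width, v32i16 (512 bits)
// takes the SVE path, v8i16 (128 bits) stays on NEON, and wider types are
// rejected here and split up by generic legalization.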
3630
3631 // TODO: Perhaps an artificial restriction, but worth having whilst getting
3632 // the base fixed length SVE support in place.
3633 if (!VT.isPow2VectorType())
3634 return false;
3635
3636 return true;
3637}
3638
3639//===----------------------------------------------------------------------===//
3640// Calling Convention Implementation
3641//===----------------------------------------------------------------------===//
3642
3643/// Selects the correct CCAssignFn for a given CallingConvention value.
3645 bool IsVarArg) const {
3646 switch (CC) {
3647 default:
3648 report_fatal_error("Unsupported calling convention.");
3649 case CallingConv::WebKit_JS:
3650 return CC_AArch64_WebKit_JS;
3651 case CallingConv::GHC:
3652 return CC_AArch64_GHC;
3653 case CallingConv::C:
3654 case CallingConv::Fast:
3655 case CallingConv::PreserveMost:
3656 case CallingConv::CXX_FAST_TLS:
3657 case CallingConv::Swift:
3658 if (Subtarget->isTargetWindows() && IsVarArg)
3659 return CC_AArch64_Win64_VarArg;
3660 if (!Subtarget->isTargetDarwin())
3661 return CC_AArch64_AAPCS;
3662 if (!IsVarArg)
3663 return CC_AArch64_DarwinPCS;
3664 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
3665 : CC_AArch64_DarwinPCS_VarArg;
3666 case CallingConv::Win64:
3667 return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
3668 case CallingConv::CFGuard_Check:
3669 return CC_AArch64_Win64_CFGuard_Check;
3670 case CallingConv::AArch64_VectorCall:
3671 case CallingConv::AArch64_SVE_VectorCall:
3672 return CC_AArch64_AAPCS;
3673 }
3674}
3675
3676CCAssignFn *
3677 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
3678 return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
3679 : RetCC_AArch64_AAPCS;
3680 }
3681
3682SDValue AArch64TargetLowering::LowerFormalArguments(
3683 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3684 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3685 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3686 MachineFunction &MF = DAG.getMachineFunction();
3687 MachineFrameInfo &MFI = MF.getFrameInfo();
3688 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3689
3690 // Assign locations to all of the incoming arguments.
3691 SmallVector<CCValAssign, 16> ArgLocs;
3693 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3694 *DAG.getContext());
3695
3696 // At this point, Ins[].VT may already be promoted to i32. To correctly
3697 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3698 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3699 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
3700 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
3701 // LocVT.
3702 unsigned NumArgs = Ins.size();
3703 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
3704 unsigned CurArgIdx = 0;
3705 for (unsigned i = 0; i != NumArgs; ++i) {
3706 MVT ValVT = Ins[i].VT;
3707 if (Ins[i].isOrigArg()) {
3708 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
3709 CurArgIdx = Ins[i].getOrigArgIndex();
3710
3711 // Get type of the original argument.
3712 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
3713 /*AllowUnknown*/ true);
3714 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
3715 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3716 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3717 ValVT = MVT::i8;
3718 else if (ActualMVT == MVT::i16)
3719 ValVT = MVT::i16;
3720 }
3721 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3722 bool Res =
3723 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
3724 assert(!Res && "Call operand has unhandled type");
3725 (void)Res;
3726 }
3727 assert(ArgLocs.size() == Ins.size());
3729 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3730 CCValAssign &VA = ArgLocs[i];
3731
3732 if (Ins[i].Flags.isByVal()) {
3733 // Byval is used for HFAs in the PCS, but the system should work in a
3734 // non-compliant manner for larger structs.
3735 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3736 int Size = Ins[i].Flags.getByValSize();
3737 unsigned NumRegs = (Size + 7) / 8;
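// e.g. a 12-byte struct gets NumRegs = (12 + 7) / 8 = 2, i.e. a 16-byte
// fixed stack object.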
3738
3739 // FIXME: This works on big-endian for composite byvals, which are the common
3740 // case. It should also work for fundamental types too.
3741 unsigned FrameIdx =
3742 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
3743 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
3744 InVals.push_back(FrameIdxN);
3745
3746 continue;
3747 }
3748
3749 SDValue ArgValue;
3750 if (VA.isRegLoc()) {
3751 // Arguments stored in registers.
3752 EVT RegVT = VA.getLocVT();
3753 const TargetRegisterClass *RC;
3754
3755 if (RegVT == MVT::i32)
3756 RC = &AArch64::GPR32RegClass;
3757 else if (RegVT == MVT::i64)
3758 RC = &AArch64::GPR64RegClass;
3759 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
3760 RC = &AArch64::FPR16RegClass;
3761 else if (RegVT == MVT::f32)
3762 RC = &AArch64::FPR32RegClass;
3763 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
3764 RC = &AArch64::FPR64RegClass;
3765 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
3766 RC = &AArch64::FPR128RegClass;
3767 else if (RegVT.isScalableVector() &&
3768 RegVT.getVectorElementType() == MVT::i1)
3769 RC = &AArch64::PPRRegClass;
3770 else if (RegVT.isScalableVector())
3771 RC = &AArch64::ZPRRegClass;
3772 else
3773 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3774
3775 // Transform the arguments in physical registers into virtual ones.
3776 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3777 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
3778
3779 // If this is an 8, 16 or 32-bit value, it is really passed promoted
3780 // to 64 bits. Insert an assert[sz]ext to capture this, then
3781 // truncate to the right size.
3782 switch (VA.getLocInfo()) {
3783 default:
3784 llvm_unreachable("Unknown loc info!");
3785 case CCValAssign::Full:
3786 break;
3787 case CCValAssign::Indirect:
3788 assert(VA.getValVT().isScalableVector() &&
3789 "Only scalable vectors can be passed indirectly");
3790 break;
3791 case CCValAssign::BCvt:
3792 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
3793 break;
3794 case CCValAssign::AExt:
3795 case CCValAssign::SExt:
3796 case CCValAssign::ZExt:
3797 break;
3798 case CCValAssign::AExtUpper:
3799 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
3800 DAG.getConstant(32, DL, RegVT));
3801 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
3802 break;
3803 }
3804 } else { // VA.isRegLoc()
3805 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
3806 unsigned ArgOffset = VA.getLocMemOffset();
3807 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
3808 ? VA.getLocVT().getSizeInBits()
3809 : VA.getValVT().getSizeInBits()) / 8;
3810
3811 uint32_t BEAlign = 0;
3812 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
3813 !Ins[i].Flags.isInConsecutiveRegs())
3814 BEAlign = 8 - ArgSize;
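// e.g. a 4-byte argument sits in the high half of its 8-byte slot on
// big-endian, so BEAlign = 8 - 4 = 4 adjusts the load offset.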
3815
3816 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
3817
3818 // Create load nodes to retrieve arguments from the stack.
3819 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3820
3821 // For NON_EXTLOAD, generic code in getLoad asserts ValVT == MemVT
3822 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
3823 MVT MemVT = VA.getValVT();
3824
3825 switch (VA.getLocInfo()) {
3826 default:
3827 break;
3828 case CCValAssign::Trunc:
3829 case CCValAssign::BCvt:
3830 MemVT = VA.getLocVT();
3831 break;
3832 case CCValAssign::Indirect:
3833 assert(VA.getValVT().isScalableVector() &&
3834 "Only scalable vectors can be passed indirectly");
3835 MemVT = VA.getLocVT();
3836 break;
3837 case CCValAssign::SExt:
3838 ExtType = ISD::SEXTLOAD;
3839 break;
3840 case CCValAssign::ZExt:
3841 ExtType = ISD::ZEXTLOAD;
3842 break;
3843 case CCValAssign::AExt:
3844 ExtType = ISD::EXTLOAD;
3845 break;
3846 }
3847
3848 ArgValue = DAG.getExtLoad(
3849 ExtType, DL, VA.getLocVT(), Chain, FIN,
3850 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3851 MemVT);
3852
3853 }
3854
3855 if (VA.getLocInfo() == CCValAssign::Indirect) {
3856 assert(VA.getValVT().isScalableVector() &&
3857 "Only scalable vectors can be passed indirectly");
3858 // If value is passed via pointer - do a load.
3859 ArgValue =
3860 DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
3861 }
3862
3863 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
3864 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
3865 ArgValue, DAG.getValueType(MVT::i32));
3866 InVals.push_back(ArgValue);
3867 }
3868
3869 // varargs
3870 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3871 if (isVarArg) {
3872 if (!Subtarget->isTargetDarwin() || IsWin64) {
3873 // The AAPCS variadic function ABI is identical to the non-variadic
3874 // one. As a result there may be more arguments in registers and we should
3875 // save them for future reference.
3876 // Win64 variadic functions also pass arguments in registers, but all float
3877 // arguments are passed in integer registers.
3878 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
3879 }
3880
3881 // This will point to the next argument passed via stack.
3882 unsigned StackOffset = CCInfo.getNextStackOffset();
3883 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
3884 StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
3885 FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
3886
3887 if (MFI.hasMustTailInVarArgFunc()) {
3888 SmallVector<MVT, 2> RegParmTypes;
3889 RegParmTypes.push_back(MVT::i64);
3890 RegParmTypes.push_back(MVT::f128);
3891 // Compute the set of forwarded registers. The rest are scratch.
3892 SmallVectorImpl<ForwardedRegister> &Forwards =
3893 FuncInfo->getForwardedMustTailRegParms();
3894 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
3895 CC_AArch64_AAPCS);
3896
3897 // Conservatively forward X8, since it might be used for aggregate return.
3898 if (!CCInfo.isAllocated(AArch64::X8)) {
3899 unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
3900 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
3901 }
3902 }
3903 }
3904
3905 // On Windows, InReg pointers must be returned, so record the pointer in a
3906 // virtual register at the start of the function so it can be returned in the
3907 // epilogue.
3908 if (IsWin64) {
3909 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3910 if (Ins[I].Flags.isInReg()) {
3911 assert(!FuncInfo->getSRetReturnReg());
3912
3913 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3914 Register Reg =
3915 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3916 FuncInfo->setSRetReturnReg(Reg);
3917
3918 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
3919 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
3920 break;
3921 }
3922 }
3923 }
3924
3925 unsigned StackArgSize = CCInfo.getNextStackOffset();
3926 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3927 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
3928 // This is a non-standard ABI so by fiat I say we're allowed to make full
3929 // use of the stack area to be popped, which must be aligned to 16 bytes in
3930 // any case:
3931 StackArgSize = alignTo(StackArgSize, 16);
3932
3933 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
3934 // a multiple of 16.
3935 FuncInfo->setArgumentStackToRestore(StackArgSize);
3936
3937 // This realignment carries over to the available bytes below. Our own
3938 // callers will guarantee the space is free by giving an aligned value to
3939 // CALLSEQ_START.
3940 }
3941 // Even if we're not expected to free up the space, it's useful to know how
3942 // much is there while considering tail calls (because we can reuse it).
3943 FuncInfo->setBytesInStackArgArea(StackArgSize);
3944
3945 if (Subtarget->hasCustomCallingConv())
3946 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
3947
3948 return Chain;
3949}
3950
3951void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
3952 SelectionDAG &DAG,
3953 const SDLoc &DL,
3954 SDValue &Chain) const {
3955 MachineFunction &MF = DAG.getMachineFunction();
3956 MachineFrameInfo &MFI = MF.getFrameInfo();
3957 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3958 auto PtrVT = getPointerTy(DAG.getDataLayout());
3959 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3960
3961 SmallVector<SDValue, 8> MemOps;
3962
3963 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
3964 AArch64::X3, AArch64::X4, AArch64::X5,
3965 AArch64::X6, AArch64::X7 };
3966 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
3967 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
3968
3969 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
3970 int GPRIdx = 0;
3971 if (GPRSaveSize != 0) {
3972 if (IsWin64) {
3973 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
3974 if (GPRSaveSize & 15)
3975 // The extra size here, if triggered, will always be 8.
3976 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
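// e.g. with three named GPR parameters, GPRSaveSize = 5 * 8 = 40;
// 40 & 15 == 8, so an extra 8-byte object pads the save area to 48 bytes.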
3977 } else
3978 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
3979
3980 SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
3981
3982 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
3983 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
3984 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
3985 SDValue Store = DAG.getStore(
3986 Val.getValue(1), DL, Val, FIN,
3987 IsWin64
3988 ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
3989 GPRIdx,
3990 (i - FirstVariadicGPR) * 8)
3991 : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
3992 MemOps.push_back(Store);
3993 FIN =
3994 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
3995 }
3996 }
3997 FuncInfo->setVarArgsGPRIndex(GPRIdx);
3998 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
3999
4000 if (Subtarget->hasFPARMv8() && !IsWin64) {
4001 static const MCPhysReg FPRArgRegs[] = {
4002 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
4003 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
4004 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
4005 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
4006
4007 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
4008 int FPRIdx = 0;
4009 if (FPRSaveSize != 0) {
4010 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
4011
4012 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
4013
4014 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
4015 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
4016 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
4017
4018 SDValue Store = DAG.getStore(
4019 Val.getValue(1), DL, Val, FIN,
4020 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
4021 MemOps.push_back(Store);
4022 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
4023 DAG.getConstant(16, DL, PtrVT));
4024 }
4025 }
4026 FuncInfo->setVarArgsFPRIndex(FPRIdx);
4027 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
4028 }
4029
4030 if (!MemOps.empty()) {
4031 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4032 }
4033}
4034
4035/// LowerCallResult - Lower the result values of a call into the
4036/// appropriate copies out of appropriate physical registers.
4037SDValue AArch64TargetLowering::LowerCallResult(
4038 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
4039 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
4040 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
4041 SDValue ThisVal) const {
4042 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
4043 ? RetCC_AArch64_WebKit_JS
4044 : RetCC_AArch64_AAPCS;
4045 // Assign locations to each value returned by this call.
4046 SmallVector<CCValAssign, 16> RVLocs;
4047 DenseMap<unsigned, SDValue> CopiedRegs;
4048 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
4049 *DAG.getContext());
4050 CCInfo.AnalyzeCallResult(Ins, RetCC);
4051
4052 // Copy all of the result registers out of their specified physreg.
4053 for (unsigned i = 0; i != RVLocs.size(); ++i) {
4054 CCValAssign VA = RVLocs[i];
4055
4056 // Pass 'this' value directly from the argument to return value, to avoid
4057 // reg unit interference
4058 if (i == 0 && isThisReturn) {
4059 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
4060 "unexpected return calling convention register assignment");
4061 InVals.push_back(ThisVal);
4062 continue;
4063 }
4064
4065 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
4066 // allows one use of a physreg per block.
4067 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
4068 if (!Val) {
4069 Val =
4070 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
4071 Chain = Val.getValue(1);
4072 InFlag = Val.getValue(2);
4073 CopiedRegs[VA.getLocReg()] = Val;
4074 }
4075
4076 switch (VA.getLocInfo()) {
4077 default:
4078 llvm_unreachable("Unknown loc info!");
4079 case CCValAssign::Full:
4080 break;
4081 case CCValAssign::BCvt:
4082 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
4083 break;
4084 case CCValAssign::AExtUpper:
4085 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
4086 DAG.getConstant(32, DL, VA.getLocVT()));
4087 LLVM_FALLTHROUGH;
4088 case CCValAssign::AExt:
4089 LLVM_FALLTHROUGH;
4090 case CCValAssign::ZExt:
4091 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
4092 break;
4093 }
4094
4095 InVals.push_back(Val);
4096 }
4097
4098 return Chain;
4099}
4100
4101/// Return true if the calling convention is one that we can guarantee TCO for.
4102 static bool canGuaranteeTCO(CallingConv::ID CC) {
4103 return CC == CallingConv::Fast;
4104}
4105
4106/// Return true if we might ever do TCO for calls with this calling convention.
4107 static bool mayTailCallThisCC(CallingConv::ID CC) {
4108 switch (CC) {
4109 case CallingConv::C:
4110 case CallingConv::AArch64_SVE_VectorCall:
4111 case CallingConv::PreserveMost:
4112 case CallingConv::Swift:
4113 return true;
4114 default:
4115 return canGuaranteeTCO(CC);
4116 }
4117}
4118
4119bool AArch64TargetLowering::isEligibleForTailCallOptimization(
4120 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4122 const SmallVectorImpl<SDValue> &OutVals,
4123 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4124 if (!mayTailCallThisCC(CalleeCC))
4125 return false;
4126
4127 MachineFunction &MF = DAG.getMachineFunction();
4128 const Function &CallerF = MF.getFunction();
4129 CallingConv::ID CallerCC = CallerF.getCallingConv();
4130
4131 // If this function uses the C calling convention but has an SVE signature,
4132 // then it preserves more registers and should assume the SVE_VectorCall CC.
4133 // The check for matching callee-saved regs will determine whether it is
4134 // eligible for TCO.
4135 if (CallerCC == CallingConv::C &&
4136 AArch64RegisterInfo::hasSVEArgsOrReturn(&MF))
4137 CallerCC = CallingConv::AArch64_SVE_VectorCall;
4138
4139 bool CCMatch = CallerCC == CalleeCC;
4140
4141 // When using the Windows calling convention on a non-windows OS, we want
4142 // to back up and restore X18 in such functions; we can't do a tail call
4143 // from those functions.
4144 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
4145 CalleeCC != CallingConv::Win64)
4146 return false;
4147
4148 // Byval parameters hand the function a pointer directly into the stack area
4149 // we want to reuse during a tail call. Working around this *is* possible (see
4150 // X86) but less efficient and uglier in LowerCall.
4151 for (Function::const_arg_iterator i = CallerF.arg_begin(),
4152 e = CallerF.arg_end();
4153 i != e; ++i) {
4154 if (i->hasByValAttr())
4155 return false;
4156
4157 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
4158 // In this case, it is necessary to save/restore X0 in the callee. Tail
4159 // call opt interferes with this. So we disable tail call opt when the
4160 // caller has an argument with "inreg" attribute.
4161
4162 // FIXME: Check whether the callee also has an "inreg" argument.
4163 if (i->hasInRegAttr())
4164 return false;
4165 }
4166
4167 if (getTargetMachine().Options.GuaranteedTailCallOpt)
4168 return canGuaranteeTCO(CalleeCC) && CCMatch;
4169
4170 // Externally-defined functions with weak linkage should not be
4171 // tail-called on AArch64 when the OS does not support dynamic
4172 // pre-emption of symbols, as the AAELF spec requires normal calls
4173 // to undefined weak functions to be replaced with a NOP or jump to the
4174 // next instruction. The behaviour of branch instructions in this
4175 // situation (as used for tail calls) is implementation-defined, so we
4176 // cannot rely on the linker replacing the tail call with a return.
4177 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
4178 const GlobalValue *GV = G->getGlobal();
4179 const Triple &TT = getTargetMachine().getTargetTriple();
4180 if (GV->hasExternalWeakLinkage() &&
4181 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
4182 return false;
4183 }
4184
4185 // Now we search for cases where we can use a tail call without changing the
4186 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
4187 // concept.
4188
4189 // I want anyone implementing a new calling convention to think long and hard
4190 // about this assert.
4191 assert((!isVarArg || CalleeCC == CallingConv::C) &&
4192 "Unexpected variadic calling convention");
4193
4194 LLVMContext &C = *DAG.getContext();
4195 if (isVarArg && !Outs.empty()) {
4196 // At least two cases here: if caller is fastcc then we can't have any
4197 // memory arguments (we'd be expected to clean up the stack afterwards). If
4198 // caller is C then we could potentially use its argument area.
4199
4200 // FIXME: for now we take the most conservative of these in both cases:
4201 // disallow all variadic memory operands.
4202 SmallVector<CCValAssign, 16> ArgLocs;
4203 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4204
4205 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
4206 for (const CCValAssign &ArgLoc : ArgLocs)
4207 if (!ArgLoc.isRegLoc())
4208 return false;
4209 }
4210
4211 // Check that the call results are passed in the same way.
4212 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4213 CCAssignFnForCall(CalleeCC, isVarArg),
4214 CCAssignFnForCall(CallerCC, isVarArg)))
4215 return false;
4216 // The callee has to preserve all registers the caller needs to preserve.
4217 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
4218 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4219 if (!CCMatch) {
4220 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4221 if (Subtarget->hasCustomCallingConv()) {
4222 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
4223 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
4224 }
4225 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4226 return false;
4227 }
4228
4229 // Nothing more to check if the callee is taking no arguments
4230 if (Outs.empty())
4231 return true;
4232
4233 SmallVector<CCValAssign, 16> ArgLocs;
4234 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4235
4236 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
4237
4238 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
4239
4240 // If any of the arguments is passed indirectly, it must be SVE, so the
4241 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
4242 // allocate space on the stack. That is why we explicitly decide here that
4243 // such a call cannot be a tail call.
4244 if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
4245 assert((A.getLocInfo() != CCValAssign::Indirect ||
4246 A.getValVT().isScalableVector()) &&
4247 "Expected value to be scalable");
4248 return A.getLocInfo() == CCValAssign::Indirect;
4249 }))
4250 return false;
4251
4252 // If the stack arguments for this call do not fit into our own save area then
4253 // the call cannot be made tail.
4254 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
4255 return false;
4256
4257 const MachineRegisterInfo &MRI = MF.getRegInfo();
4258 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4259 return false;
4260
4261 return true;
4262}
4263
4264SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
4265 SelectionDAG &DAG,
4266 MachineFrameInfo &MFI,
4267 int ClobberedFI) const {
4268 SmallVector<SDValue, 8> ArgChains;
4269 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
4270 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
4271
4272 // Include the original chain at the beginning of the list. When this is
4273 // used by target LowerCall hooks, this helps legalize find the
4274 // CALLSEQ_BEGIN node.
4275 ArgChains.push_back(Chain);
4276
4277 // Add a chain value for each stack argument corresponding to this one.
4278 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
4279 UE = DAG.getEntryNode().getNode()->use_end();
4280 U != UE; ++U)
4281 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
4282 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
4283 if (FI->getIndex() < 0) {
4284 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
4285 int64_t InLastByte = InFirstByte;
4286 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
4287
4288 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
4289 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
4290 ArgChains.push_back(SDValue(L, 1));
4291 }
4292
4293 // Build a tokenfactor for all the chains.
4294 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
4295}
4296
4297bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
4298 bool TailCallOpt) const {
4300}
4301
4302/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
4303/// and add input and output parameter nodes.
4304SDValue
4305AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
4306 SmallVectorImpl<SDValue> &InVals) const {
4307 SelectionDAG &DAG = CLI.DAG;
4308 SDLoc &DL = CLI.DL;
4309 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
4310 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4311 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4312 SDValue Chain = CLI.Chain;
4313 SDValue Callee = CLI.Callee;
4314 bool &IsTailCall = CLI.IsTailCall;
4315 CallingConv::ID CallConv = CLI.CallConv;
4316 bool IsVarArg = CLI.IsVarArg;
4317
4320 bool IsThisReturn = false;
4321
4324 bool IsSibCall = false;
4325
4326 // Check callee args/returns for SVE registers and set calling convention
4327 // accordingly.
4328 if (CallConv == CallingConv::C) {
4329 bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
4330 return Out.VT.isScalableVector();
4331 });
4332 bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
4333 return In.VT.isScalableVector();
4334 });
4335
4336 if (CalleeInSVE || CalleeOutSVE)
4337 CallConv = CallingConv::AArch64_SVE_VectorCall;
4338 }
4339
4340 if (IsTailCall) {
4341 // Check if it's really possible to do a tail call.
4342 IsTailCall = isEligibleForTailCallOptimization(
4343 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
4344 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
4345 report_fatal_error("failed to perform tail call elimination on a call "
4346 "site marked musttail");
4347
4348 // A sibling call is one where we're under the usual C ABI and not planning
4349 // to change that but can still do a tail call:
4350 if (!TailCallOpt && IsTailCall)
4351 IsSibCall = true;
4352
4353 if (IsTailCall)
4354 ++NumTailCalls;
4355 }
4356
4357 // Analyze operands of the call, assigning locations to each operand.
4358 SmallVector<CCValAssign, 16> ArgLocs;
4359 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
4360 *DAG.getContext());
4361
4362 if (IsVarArg) {
4363 // Handle fixed and variable vector arguments differently.
4364 // Variable vector arguments always go into memory.
4365 unsigned NumArgs = Outs.size();
4366
4367 for (unsigned i = 0; i != NumArgs; ++i) {
4368 MVT ArgVT = Outs[i].VT;
4369 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
4370 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
4371 /*IsVarArg=*/ !Outs[i].IsFixed);
4372 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
4373 assert(!Res && "Call operand has unhandled type");
4374 (void)Res;
4375 }
4376 } else {
4377 // At this point, Outs[].VT may already be promoted to i32. To correctly
4378 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
4379 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
4380 // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
4381 // we use a special version of AnalyzeCallOperands to pass in ValVT and
4382 // LocVT.
4383 unsigned NumArgs = Outs.size();
4384 for (unsigned i = 0; i != NumArgs; ++i) {
4385 MVT ValVT = Outs[i].VT;
4386 // Get type of the original argument.
4387 EVT ActualVT = getValueType(DAG.getDataLayout(),
4388 CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
4389 /*AllowUnknown*/ true);
4390 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
4391 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
4392 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
4393 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
4394 ValVT = MVT::i8;
4395 else if (ActualMVT == MVT::i16)
4396 ValVT = MVT::i16;
4397
4398 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
4399 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
4400 assert(!Res && "Call operand has unhandled type");
4401 (void)Res;
4402 }
4403 }
4404
4405 // Get a count of how many bytes are to be pushed on the stack.
4406 unsigned NumBytes = CCInfo.getNextStackOffset();
4407
4408 if (IsSibCall) {
4409 // Since we're not changing the ABI to make this a tail call, the memory
4410 // operands are already available in the caller's incoming argument space.
4411 NumBytes = 0;
4412 }
4413
4414 // FPDiff is the byte offset of the call's argument area from the callee's.
4415 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4416 // by this amount for a tail call. In a sibling call it must be 0 because the
4417 // caller will deallocate the entire stack and the callee still expects its
4418 // arguments to begin at SP+0. Completely unused for non-tail calls.
4419 int FPDiff = 0;
4420
4421 if (IsTailCall && !IsSibCall) {
4422 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
4423
4424 // Since callee will pop argument stack as a tail call, we must keep the
4425 // popped size 16-byte aligned.
4426 NumBytes = alignTo(NumBytes, 16);
4427
4428 // FPDiff will be negative if this tail call requires more space than we
4429 // would automatically have in our incoming argument space. Positive if we
4430 // can actually shrink the stack.
4431 FPDiff = NumReusableBytes - NumBytes;
4432
4433 // The stack pointer must be 16-byte aligned at all times it's used for a
4434 // memory operation, which in practice means at *all* times and in
4435 // particular across call boundaries. Therefore our own arguments started at
4436 // a 16-byte aligned SP and the delta applied for the tail call should
4437 // satisfy the same constraint.
4438 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
4439 }
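// Worked example: a caller with 32 bytes of incoming stack arguments
// (NumReusableBytes) making a tail call that needs 48 bytes of outgoing
// arguments gets FPDiff = -16, i.e. the callee's argument area extends 16
// bytes past our own; both quantities remain 16-byte aligned.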
4440
4441 // Adjust the stack pointer for the new arguments...
4442 // These operations are automatically eliminated by the prolog/epilog pass
4443 if (!IsSibCall)
4444 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
4445
4446 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
4447 getPointerTy(DAG.getDataLayout()));
4448
4449 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
4450 SmallSet<unsigned, 8> RegsUsed;
4451 SmallVector<SDValue, 8> MemOpChains;
4452 auto PtrVT = getPointerTy(DAG.getDataLayout());
4453
4454 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
4455 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
4456 for (const auto &F : Forwards) {
4457 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
4458 RegsToPass.emplace_back(F.PReg, Val);
4459 }
4460 }
4461
4462 // Walk the register/memloc assignments, inserting copies/loads.
4463 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4464 CCValAssign &VA = ArgLocs[i];
4465 SDValue Arg = OutVals[i];
4466 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4467
4468 // Promote the value if needed.
4469 switch (VA.getLocInfo()) {
4470 default:
4471 llvm_unreachable("Unknown loc info!");
4472 case CCValAssign::Full:
4473 break;
4474 case CCValAssign::SExt:
4475 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4476 break;
4477 case CCValAssign::ZExt:
4478 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4479 break;
4480 case CCValAssign::AExt:
4481 if (Outs[i].ArgVT == MVT::i1) {
4482 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
4483 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
4484 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4485 }
4486 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4487 break;
4488 case CCValAssign::AExtUpper:
4489 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
4490 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4491 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
4492 DAG.getConstant(32, DL, VA.getLocVT()));
4493 break;
4494 case CCValAssign::BCvt:
4495 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
4496 break;
4497 case CCValAssign::Trunc:
4498 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
4499 break;
4500 case CCValAssign::FPExt:
4501 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4502 break;
4503 case CCValAssign::Indirect:
4504 assert(VA.getValVT().isScalableVector() &&
4505 "Only scalable vectors can be passed indirectly");
4506 MachineFrameInfo &MFI = MF.getFrameInfo();
4507 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
4508 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
4509 int FI = MFI.CreateStackObject(
4510 VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
4511 MFI.setStackID(FI, TargetStackID::SVEVector);
4512
4513 SDValue SpillSlot = DAG.getFrameIndex(
4514 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
4515 Chain = DAG.getStore(
4516 Chain, DL, Arg, SpillSlot,
4517 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4518 Arg = SpillSlot;
4519 break;
4520 }
4521
4522 if (VA.isRegLoc()) {
4523 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
4524 Outs[0].VT == MVT::i64) {
4525 assert(VA.getLocVT() == MVT::i64 &&
4526 "unexpected calling convention register assignment");
4527 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
4528 "unexpected use of 'returned'");
4529 IsThisReturn = true;
4530 }
4531 if (RegsUsed.count(VA.getLocReg())) {
4532 // If this register has already been used then we're trying to pack
4533 // parts of an [N x i32] into an X-register. The extension type will
4534 // take care of putting the two halves in the right place but we have to
4535 // combine them.
4536 SDValue &Bits =
4537 std::find_if(RegsToPass.begin(), RegsToPass.end(),
4538 [=](const std::pair<unsigned, SDValue> &Elt) {
4539 return Elt.first == VA.getLocReg();
4540 })
4541 ->second;
4542 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
4543 // Call site info is used for function's parameter entry value
4544 // tracking. For now we track only simple cases when parameter
4545 // is transferred through whole register.
4546 CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(),
4547 [&VA](MachineFunction::ArgRegPair ArgReg) {
4548 return ArgReg.Reg == VA.getLocReg();
4549 }),
4550 CSInfo.end());
4551 } else {
4552 RegsToPass.emplace_back(VA.getLocReg(), Arg);
4553 RegsUsed.insert(VA.getLocReg());
4554 const TargetOptions &Options = DAG.getTarget().Options;
4555 if (Options.EmitCallSiteInfo)
4556 CSInfo.emplace_back(VA.getLocReg(), i);
4557 }
4558 } else {
4559 assert(VA.isMemLoc());
4560
4561 SDValue DstAddr;
4562 MachinePointerInfo DstInfo;
4563
4564 // FIXME: This works on big-endian for composite byvals, which are the
4565 // common case. It should also work for fundamental types too.
4566 uint32_t BEAlign = 0;
4567 unsigned OpSize;
4568 if (VA.getLocInfo() == CCValAssign::Indirect)
4569 OpSize = VA.getLocVT().getSizeInBits();
4570 else
4571 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
4572 : VA.getValVT().getSizeInBits();
4573 OpSize = (OpSize + 7) / 8;
4574 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
4575 !Flags.isInConsecutiveRegs()) {
4576 if (OpSize < 8)
4577 BEAlign = 8 - OpSize;
4578 }
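// For illustration: on big-endian, a 4-byte operand assigned to an 8-byte
// stack slot gets BEAlign = 4, placing the value in the last OpSize bytes of
// the slot, which is where a big-endian callee reading only OpSize bytes
// expects to find it.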
4579 unsigned LocMemOffset = VA.getLocMemOffset();
4580 int32_t Offset = LocMemOffset + BEAlign;
4581 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
4582 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
4583
4584 if (IsTailCall) {
4585 Offset = Offset + FPDiff;
4586 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4587
4588 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4589 DstInfo =
4590 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
4591
4592 // Make sure any stack arguments overlapping with where we're storing
4593 // are loaded before this eventual operation. Otherwise they'll be
4594 // clobbered.
4595 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
4596 } else {
4597 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
4598
4599 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
4600 DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
4601 LocMemOffset);
4602 }
4603
4604 if (Outs[i].Flags.isByVal()) {
4605 SDValue SizeNode =
4606 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
4607 SDValue Cpy = DAG.getMemcpy(
4608 Chain, DL, DstAddr, Arg, SizeNode,
4609 Outs[i].Flags.getNonZeroByValAlign(),
4610 /*isVol = */ false, /*AlwaysInline = */ false,
4611 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
4612
4613 MemOpChains.push_back(Cpy);
4614 } else {
4615 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
4616 // promoted to a legal register type i32, we should truncate Arg back to
4617 // i1/i8/i16.
4618 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
4619 VA.getValVT() == MVT::i16)
4620 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
4621
4622 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
4623 MemOpChains.push_back(Store);
4624 }
4625 }
4626 }
4627
4628 if (!MemOpChains.empty())
4629 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4630
4631 // Build a sequence of copy-to-reg nodes chained together with token chain
4632 // and flag operands which copy the outgoing args into the appropriate regs.
4633 SDValue InFlag;
4634 for (auto &RegToPass : RegsToPass) {
4635 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
4636 RegToPass.second, InFlag);
4637 InFlag = Chain.getValue(1);
4638 }
4639
4640 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
4641 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
4642 // node so that legalize doesn't hack it.
4643 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
4644 auto GV = G->getGlobal();
4645 unsigned OpFlags =
4646 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
4647 if (OpFlags & AArch64II::MO_GOT) {
4648 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
4649 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
4650 } else {
4651 const GlobalValue *GV = G->getGlobal();
4652 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
4653 }
4654 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
4655 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
4656 Subtarget->isTargetMachO()) {
4657 const char *Sym = S->getSymbol();
4658 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
4659 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
4660 } else {
4661 const char *Sym = S->getSymbol();
4662 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
4663 }
4664 }
4665
4666 // We don't usually want to end the call-sequence here because we would tidy
4667 // the frame up *after* the call, however in the ABI-changing tail-call case
4668 // we've carefully laid out the parameters so that when sp is reset they'll be
4669 // in the correct location.
4670 if (IsTailCall && !IsSibCall) {
4671 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
4672 DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
4673 InFlag = Chain.getValue(1);
4674 }
4675
4676 std::vector<SDValue> Ops;
4677 Ops.push_back(Chain);
4678 Ops.push_back(Callee);
4679
4680 if (IsTailCall) {
4681 // Each tail call may have to adjust the stack by a different amount, so
4682 // this information must travel along with the operation for eventual
4683 // consumption by emitEpilogue.
4684 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4685 }
4686
4687 // Add argument registers to the end of the list so that they are known live
4688 // into the call.
4689 for (auto &RegToPass : RegsToPass)
4690 Ops.push_back(DAG.getRegister(RegToPass.first,
4691 RegToPass.second.getValueType()));
4692
4693 // Add a register mask operand representing the call-preserved registers.
4694 const uint32_t *Mask;
4695 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
4696 if (IsThisReturn) {
4697 // For 'this' returns, use the X0-preserving mask if applicable
4698 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
4699 if (!Mask) {
4700 IsThisReturn = false;
4701 Mask = TRI->getCallPreservedMask(MF, CallConv);
4702 }
4703 } else
4704 Mask = TRI->getCallPreservedMask(MF, CallConv);
4705
4706 if (Subtarget->hasCustomCallingConv())
4707 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
4708
4709 if (TRI->isAnyArgRegReserved(MF))
4710 TRI->emitReservedArgRegCallError(MF);
4711
4712 assert(Mask && "Missing call preserved mask for calling convention");
4713 Ops.push_back(DAG.getRegisterMask(Mask));
4714
4715 if (InFlag.getNode())
4716 Ops.push_back(InFlag);
4717
4718 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4719
4720 // If we're doing a tail call, use a TC_RETURN here rather than an
4721 // actual call instruction.
4722 if (IsTailCall) {
4723 MF.getFrameInfo().setHasTailCall();
4724 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
4725 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4726 return Ret;
4727 }
4728
4729 // Returns a chain and a flag for retval copy to use.
4730 Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
4731 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4732 InFlag = Chain.getValue(1);
4733 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4734
4735 uint64_t CalleePopBytes =
4736 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
4737
4738 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
4739 DAG.getIntPtrConstant(CalleePopBytes, DL, true),
4740 InFlag, DL);
4741 if (!Ins.empty())
4742 InFlag = Chain.getValue(1);
4743
4744 // Handle result values, copying them out of physregs into vregs that we
4745 // return.
4746 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
4747 InVals, IsThisReturn,
4748 IsThisReturn ? OutVals[0] : SDValue());
4749}
4750
4751bool AArch64TargetLowering::CanLowerReturn(
4752 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
4753 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
4754 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
4755 ? RetCC_AArch64_WebKit_JS
4756 : RetCC_AArch64_AAPCS;
4757 SmallVector<CCValAssign, 16> RVLocs;
4758 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
4759 return CCInfo.CheckReturn(Outs, RetCC);
4760}
4761
4762SDValue
4763AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
4764 bool isVarArg,
4765 const SmallVectorImpl<ISD::OutputArg> &Outs,
4766 const SmallVectorImpl<SDValue> &OutVals,
4767 const SDLoc &DL, SelectionDAG &DAG) const {
4768 auto &MF = DAG.getMachineFunction();
4769 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
4770
4771 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
4772 ? RetCC_AArch64_WebKit_JS
4773 : RetCC_AArch64_AAPCS;
4774 SmallVector<CCValAssign, 16> RVLocs;
4775 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
4776 *DAG.getContext());
4777 CCInfo.AnalyzeReturn(Outs, RetCC);
4778
4779 // Copy the result values into the output registers.
4780 SDValue Flag;
4781 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
4782 SmallSet<unsigned, 4> RegsUsed;
4783 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
4784 ++i, ++realRVLocIdx) {
4785 CCValAssign &VA = RVLocs[i];
4786 assert(VA.isRegLoc() && "Can only return in registers!");
4787 SDValue Arg = OutVals[realRVLocIdx];
4788
4789 switch (VA.getLocInfo()) {
4790 default:
4791 llvm_unreachable("Unknown loc info!");
4792 case CCValAssign::Full:
4793 if (Outs[i].ArgVT == MVT::i1) {
4794 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
4795 // value. This is strictly redundant on Darwin (which uses "zeroext
4796 // i1"), but will be optimised out before ISel.
4797 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
4798 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4799 }
4800 break;
4801 case CCValAssign::BCvt:
4802 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4803 break;
4804 case CCValAssign::AExt:
4805 case CCValAssign::ZExt:
4806 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
4807 break;
4808 case CCValAssign::AExtUpper:
4809 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
4810 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
4811 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
4812 DAG.getConstant(32, DL, VA.getLocVT()));
4813 break;
4814 }
4815
4816 if (RegsUsed.count(VA.getLocReg())) {
4817 SDValue &Bits =
4818 std::find_if(RetVals.begin(), RetVals.end(),
4819 [=](const std::pair<unsigned, SDValue> &Elt) {
4820 return Elt.first == VA.getLocReg();
4821 })
4822 ->second;
4823 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
4824 } else {
4825 RetVals.emplace_back(VA.getLocReg(), Arg);
4826 RegsUsed.insert(VA.getLocReg());
4827 }
4828 }
4829
4830 SmallVector<SDValue, 4> RetOps(1, Chain);
4831 for (auto &RetVal : RetVals) {
4832 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
4833 Flag = Chain.getValue(1);
4834 RetOps.push_back(
4835 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
4836 }
4837
4838 // Windows AArch64 ABIs require that for returning structs by value we copy
4839 // the sret argument into X0 for the return.
4840 // We saved the argument into a virtual register in the entry block,
4841 // so now we copy the value out and into X0.
4842 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
4843 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
4844 getPointerTy(MF.getDataLayout()));
4845
4846 unsigned RetValReg = AArch64::X0;
4847 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
4848 Flag = Chain.getValue(1);
4849
4850 RetOps.push_back(
4851 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
4852 }
4853
4854 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
4855 const MCPhysReg *I =
4856 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
4857 if (I) {
4858 for (; *I; ++I) {
4859 if (AArch64::GPR64RegClass.contains(*I))
4860 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
4861 else if (AArch64::FPR64RegClass.contains(*I))
4862 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
4863 else
4864 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
4865 }
4866 }
4867
4868 RetOps[0] = Chain; // Update chain.
4869
4870 // Add the flag if we have it.
4871 if (Flag.getNode())
4872 RetOps.push_back(Flag);
4873
4874 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
4875}
4876
4877//===----------------------------------------------------------------------===//
4878// Other Lowering Code
4879//===----------------------------------------------------------------------===//
4880
4881SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
4882 SelectionDAG &DAG,
4883 unsigned Flag) const {
4884 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
4885 N->getOffset(), Flag);
4886}
4887
4888SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
4889 SelectionDAG &DAG,
4890 unsigned Flag) const {
4891 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
4892}
4893
4894SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
4895 SelectionDAG &DAG,
4896 unsigned Flag) const {
4897 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
4898 N->getOffset(), Flag);
4899}
4900
4901SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
4902 SelectionDAG &DAG,
4903 unsigned Flag) const {
4904 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
4905}
4906
4907// (loadGOT sym)
4908template <class NodeTy>
4909SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
4910 unsigned Flags) const {
4911 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
4912 SDLoc DL(N);
4913 EVT Ty = getPointerTy(DAG.getDataLayout());
4914 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
4915 // FIXME: Once remat is capable of dealing with instructions with register
4916 // operands, expand this into two nodes instead of using a wrapper node.
4917 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
4918}
4919
4920// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
4921template <class NodeTy>
4922SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
4923 unsigned Flags) const {
4924 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
4925 SDLoc DL(N);
4926 EVT Ty = getPointerTy(DAG.getDataLayout());
4927 const unsigned char MO_NC = AArch64II::MO_NC;
4928 return DAG.getNode(
4929 AArch64ISD::WrapperLarge, DL, Ty,
4930 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
4931 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
4932 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
4933 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
4934}
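// Illustrative expansion of the WrapperLarge node above (for a symbol "sym"):
//   movz x0, #:abs_g3:sym
//   movk x0, #:abs_g2_nc:sym
//   movk x0, #:abs_g1_nc:sym
//   movk x0, #:abs_g0_nc:sym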
4935
4936// (addlow (adrp %hi(sym)) %lo(sym))
4937template <class NodeTy>
4938SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
4939 unsigned Flags) const {
4940 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
4941 SDLoc DL(N);
4942 EVT Ty = getPointerTy(DAG.getDataLayout());
4943 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
4944 SDValue Lo = getTargetNode(N, Ty, DAG,
4945 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
4946 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
4947 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
4948}
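// Illustrative expansion of the ADRP/ADDlow pair above (for a symbol "sym"):
//   adrp x0, sym
//   add  x0, x0, :lo12:sym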
4949
4950// (adr sym)
4951template <class NodeTy>
4952SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
4953 unsigned Flags) const {
4954 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
4955 SDLoc DL(N);
4956 EVT Ty = getPointerTy(DAG.getDataLayout());
4957 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
4958 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
4959}
4960
4961SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
4962 SelectionDAG &DAG) const {
4963 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
4964 const GlobalValue *GV = GN->getGlobal();
4965 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
4966
4969 "unexpected offset in global node");
4970
4971 // This also catches the large code model case for Darwin, and tiny code
4972 // model with got relocations.
4973 if ((OpFlags & AArch64II::MO_GOT) != 0) {
4974 return getGOT(GN, DAG, OpFlags);
4975 }
4976
4977 SDValue Result;
4978 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
4979 Result = getAddrLarge(GN, DAG, OpFlags);
4980 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
4981 Result = getAddrTiny(GN, DAG, OpFlags);
4982 } else {
4983 Result = getAddr(GN, DAG, OpFlags);
4984 }
4985 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4986 SDLoc DL(GN);
4987 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
4988 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
4989 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
4990 return Result;
4991}
4992
4993/// Convert a TLS address reference into the correct sequence of loads
4994/// and calls to compute the variable's address (for Darwin, currently) and
4995/// return an SDValue containing the final node.
4996
4997/// Darwin only has one TLS scheme which must be capable of dealing with the
4998/// fully general situation, in the worst case. This means:
4999/// + "extern __thread" declaration.
5000/// + Defined in a possibly unknown dynamic library.
5001///
5002/// The general system is that each __thread variable has a [3 x i64] descriptor
5003/// which contains information used by the runtime to calculate the address. The
5004/// only part of this the compiler needs to know about is the first xword, which
5005/// contains a function pointer that must be called with the address of the
5006/// entire descriptor in "x0".
5007///
5008/// Since this descriptor may be in a different unit, in general even the
5009/// descriptor must be accessed via an indirect load. The "ideal" code sequence
5010/// is:
5011/// adrp x0, _var@TLVPPAGE
5012/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
5013/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
5014/// ; the function pointer
5015/// blr x1 ; Uses descriptor address in x0
5016/// ; Address of _var is now in x0.
5017///
5018/// If the address of _var's descriptor *is* known to the linker, then it can
5019/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
5020/// a slight efficiency gain.
5021SDValue
5022AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
5023 SelectionDAG &DAG) const {
5024 assert(Subtarget->isTargetDarwin() &&
5025 "This function expects a Darwin target");
5026
5027 SDLoc DL(Op);
5028 MVT PtrVT = getPointerTy(DAG.getDataLayout());
5029 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
5030 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
5031
5032 SDValue TLVPAddr =
5033 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
5034 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
5035
5036 // The first entry in the descriptor is a function pointer that we must call
5037 // to obtain the address of the variable.
5038 SDValue Chain = DAG.getEntryNode();
5039 SDValue FuncTLVGet = DAG.getLoad(
5040 PtrMemVT, DL, Chain, DescAddr,
5041 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
5042 /* Alignment = */ PtrMemVT.getSizeInBits() / 8,
5043 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
5044 Chain = FuncTLVGet.getValue(1);
5045
5046 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
5047 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
5048
5049 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
5050 MFI.setAdjustsStack(true);
5051
5052 // TLS calls preserve all registers except those that absolutely must be
5053 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
5054 // silly).
5055 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5056 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
5057 if (Subtarget->hasCustomCallingConv())
5058 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
5059
5060 // Finally, we can make the call. This is just a degenerate version of a
5061 // normal AArch64 call node: x0 takes the address of the descriptor, and
5062 // returns the address of the variable in this thread.
5063 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
5064 Chain =
5065 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
5066 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
5067 DAG.getRegisterMask(Mask), Chain.getValue(1));
5068 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
5069}
5070
5071/// Convert a thread-local variable reference into a sequence of instructions to
5072/// compute the variable's address for the local exec TLS model of ELF targets.
5073/// The sequence depends on the maximum TLS area size.
5074SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
5075 SDValue ThreadBase,
5076 const SDLoc &DL,
5077 SelectionDAG &DAG) const {
5078 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5079 SDValue TPOff, Addr;
5080
5081 switch (DAG.getTarget().Options.TLSSize) {
5082 default:
5083 llvm_unreachable("Unexpected TLS size");
5084
5085 case 12: {
5086 // mrs x0, TPIDR_EL0
5087 // add x0, x0, :tprel_lo12:a
5088 SDValue Var = DAG.getTargetGlobalAddress(
5089 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
5090 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
5091 Var,
5092 DAG.getTargetConstant(0, DL, MVT::i32)),
5093 0);
5094 }
5095
5096 case 24: {
5097 // mrs x0, TPIDR_EL0
5098 // add x0, x0, :tprel_hi12:a
5099 // add x0, x0, :tprel_lo12_nc:a
5100 SDValue HiVar = DAG.getTargetGlobalAddress(
5101 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
5102 SDValue LoVar = DAG.getTargetGlobalAddress(
5103 GV, DL, PtrVT, 0,
5104 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
5105 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
5106 HiVar,
5107 DAG.getTargetConstant(0, DL, MVT::i32)),
5108 0);
5109 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
5110 LoVar,
5111 DAG.getTargetConstant(0, DL, MVT::i32)),
5112 0);
5113 }
5114
5115 case 32: {
5116 // mrs x1, TPIDR_EL0
5117 // movz x0, #:tprel_g1:a
5118 // movk x0, #:tprel_g0_nc:a
5119 // add x0, x1, x0
5120 SDValue HiVar = DAG.getTargetGlobalAddress(
5121 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
5122 SDValue LoVar = DAG.getTargetGlobalAddress(
5123 GV, DL, PtrVT, 0,
5124 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
5125 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
5126 DAG.getTargetConstant(16, DL, MVT::i32)),
5127 0);
5128 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
5129 DAG.getTargetConstant(0, DL, MVT::i32)),
5130 0);
5131 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
5132 }
5133
5134 case 48: {
5135 // mrs x1, TPIDR_EL0
5136 // movz x0, #:tprel_g2:a
5137 // movk x0, #:tprel_g1_nc:a
5138 // movk x0, #:tprel_g0_nc:a
5139 // add x0, x1, x0
5140 SDValue HiVar = DAG.getTargetGlobalAddress(
5141 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
5142 SDValue MiVar = DAG.getTargetGlobalAddress(
5143 GV, DL, PtrVT, 0,
5144 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
5145 SDValue LoVar = DAG.getTargetGlobalAddress(
5146 GV, DL, PtrVT, 0,
5147 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
5148 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
5149 DAG.getTargetConstant(32, DL, MVT::i32)),
5150 0);
5151 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
5152 DAG.getTargetConstant(16, DL, MVT::i32)),
5153 0);
5154 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
5155 DAG.getTargetConstant(0, DL, MVT::i32)),
5156 0);
5157 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
5158 }
5159 }
5160}
5161
5162/// When accessing thread-local variables under either the general-dynamic or
5163/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
5164/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
5165/// is a function pointer to carry out the resolution.
5166///
5167/// The sequence is:
5168/// adrp x0, :tlsdesc:var
5169/// ldr x1, [x0, #:tlsdesc_lo12:var]
5170/// add x0, x0, #:tlsdesc_lo12:var
5171/// .tlsdesccall var
5172/// blr x1
5173/// (TPIDR_EL0 offset now in x0)
5174///
5175/// The above sequence must be produced unscheduled, to enable the linker to
5176/// optimize/relax this sequence.
5177/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
5178/// above sequence, and expanded really late in the compilation flow, to ensure
5179/// the sequence is produced as per above.
5180SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
5181 const SDLoc &DL,
5182 SelectionDAG &DAG) const {
5183 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5184
5185 SDValue Chain = DAG.getEntryNode();
5186 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
5187
5188 Chain =
5189 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
5190 SDValue Glue = Chain.getValue(1);
5191
5192 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
5193}
5194
5195SDValue
5196AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
5197 SelectionDAG &DAG) const {
5198 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
5199
5199
5200 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
5201
5202 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
5203
5204 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
5205 if (Model == TLSModel::LocalDynamic)
5206 Model = TLSModel::GeneralDynamic;
5207 }
5208
5209 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
5210 Model != TLSModel::LocalExec)
5211 report_fatal_error("ELF TLS only supported in small memory model or "
5212 "in local exec TLS model");
5213 // Different choices can be made for the maximum size of the TLS area for a
5214 // module. For the small address model, the default TLS size is 16MiB and the
5215 // maximum TLS size is 4GiB.
5216 // FIXME: add tiny and large code model support for TLS access models other
5217 // than local exec. We currently generate the same code as small for tiny,
5218 // which may be larger than needed.
5219
5220 SDValue TPOff;
5221 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5222 SDLoc DL(Op);
5223 const GlobalValue *GV = GA->getGlobal();
5224
5225 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
5226
5227 if (Model == TLSModel::LocalExec) {
5228 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
5229 } else if (Model == TLSModel::InitialExec) {
5230 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
5231 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
5232 } else if (Model == TLSModel::LocalDynamic) {
5233 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
5234 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
5235 // the beginning of the module's TLS region, followed by a DTPREL offset
5236 // calculation.
5237
5238 // These accesses will need deduplicating if there's more than one.
5239 AArch64FunctionInfo *MFI =
5240 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
5241 MFI->incNumLocalDynamicTLSAccesses();
5242
5243 // The call needs a relocation too for linker relaxation. It doesn't make
5244 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
5245 // the address.
5246 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
5247 AArch64II::MO_TLS);
5248
5249 // Now we can calculate the offset from TPIDR_EL0 to this module's
5250 // thread-local area.
5251 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
5252
5253 // Now use :dtprel_whatever: operations to calculate this variable's offset
5254 // in its thread-storage area.
5255 SDValue HiVar = DAG.getTargetGlobalAddress(
5256 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
5257 SDValue LoVar = DAG.getTargetGlobalAddress(
5258 GV, DL, MVT::i64, 0,
5259 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
5260
5261 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
5262 DAG.getTargetConstant(0, DL, MVT::i32)),
5263 0);
5264 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
5265 DAG.getTargetConstant(0, DL, MVT::i32)),
5266 0);
5267 } else if (Model == TLSModel::GeneralDynamic) {
5268 // The call needs a relocation too for linker relaxation. It doesn't make
5269 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
5270 // the address.
5271 SDValue SymAddr =
5272 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
5273
5274 // Finally we can make a call to calculate the offset from tpidr_el0.
5275 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
5276 } else
5277 llvm_unreachable("Unsupported ELF TLS access model");
5278
5279 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
5280}
5281
5282SDValue
5283AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
5284 SelectionDAG &DAG) const {
5285 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
5286
5287 SDValue Chain = DAG.getEntryNode();
5288 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5289 SDLoc DL(Op);
5290
5291 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
5292
5293 // Load the ThreadLocalStoragePointer from the TEB
5294 // A pointer to the TLS array is located at offset 0x58 from the TEB.
5295 SDValue TLSArray =
5296 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
5297 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
5298 Chain = TLSArray.getValue(1);
5299
5300 // Load the TLS index from the C runtime;
5301 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
5302 // This also does the same as LOADgot, but using a generic i32 load,
5303 // while LOADgot only loads i64.
5304 SDValue TLSIndexHi =
5305 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
5306 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
5307 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
5308 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
5309 SDValue TLSIndex =
5310 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
5311 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
5312 Chain = TLSIndex.getValue(1);
5313
5314 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
5315 // offset into the TLSArray.
5316 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
5317 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
5318 DAG.getConstant(3, DL, PtrVT));
5319 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
5320 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
5321 MachinePointerInfo());
5322 Chain = TLS.getValue(1);
5323
5324 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
5325 const GlobalValue *GV = GA->getGlobal();
5326 SDValue TGAHi = DAG.getTargetGlobalAddress(
5327 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
5328 SDValue TGALo = DAG.getTargetGlobalAddress(
5329 GV, DL, PtrVT, 0,
5330 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
5331
5332 // Add the offset from the start of the .tls section (section base).
5333 SDValue Addr =
5334 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
5335 DAG.getTargetConstant(0, DL, MVT::i32)),
5336 0);
5337 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
5338 return Addr;
5339}
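// Roughly, the code built above computes (C-like sketch; x18 holds the TEB on
// Windows, and sectionRelOffset is a hypothetical stand-in for the
// section-relative relocation pair applied via TGAHi/TGALo):
//   char **TLSArray = *(char ***)(TEB + 0x58);
//   char *ThreadData = TLSArray[_tls_index];
//   return ThreadData + sectionRelOffset(GV);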
5340
5341SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
5342 SelectionDAG &DAG) const {
5343 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
5344 if (DAG.getTarget().useEmulatedTLS())
5345 return LowerToTLSEmulatedModel(GA, DAG);
5346
5347 if (Subtarget->isTargetDarwin())
5348 return LowerDarwinGlobalTLSAddress(Op, DAG);
5349 if (Subtarget->isTargetELF())
5350 return LowerELFGlobalTLSAddress(Op, DAG);
5351 if (Subtarget->isTargetWindows())
5352 return LowerWindowsGlobalTLSAddress(Op, DAG);
5353
5354 llvm_unreachable("Unexpected platform trying to use TLS");
5355}
5356
5357SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5358 SDValue Chain = Op.getOperand(0);
5359 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5360 SDValue LHS = Op.getOperand(2);
5361 SDValue RHS = Op.getOperand(3);
5362 SDValue Dest = Op.getOperand(4);
5363 SDLoc dl(Op);
5364
5365 MachineFunction &MF = DAG.getMachineFunction();
5366 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
5367 // will not be produced, as they are conditional branch instructions that do
5368 // not set flags.
5369 bool ProduceNonFlagSettingCondBr =
5370 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
5371
5372 // Handle f128 first, since lowering it will result in comparing the return
5373 // value of a libcall against zero, which is just what the rest of LowerBR_CC
5374 // is expecting to deal with.
5375 if (LHS.getValueType() == MVT::f128) {
5376 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
5377
5378 // If softenSetCCOperands returned a scalar, we need to compare the result
5379 // against zero to select between true and false values.
5380 if (!RHS.getNode()) {
5381 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5382 CC = ISD::SETNE;
5383 }
5384 }
5385
5386 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5387 // instruction.
5388 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
5389 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5390 // Only lower legal XALUO ops.
5391 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
5392 return SDValue();
5393
5394 // The actual operation with overflow check.
5395 AArch64CC::CondCode OFCC;
5396 SDValue Value, Overflow;
5397 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
5398
5399 if (CC == ISD::SETNE)
5400 OFCC = getInvertedCondCode(OFCC);
5401 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
5402
5403 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
5404 Overflow);
5405 }
5406
5407 if (LHS.getValueType().isInteger()) {
5408 assert((LHS.getValueType() == RHS.getValueType()) &&
5409 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
5410
5411 // If the RHS of the comparison is zero, we can potentially fold this
5412 // to a specialized branch.
5413 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
5414 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
5415 if (CC == ISD::SETEQ) {
5416 // See if we can use a TBZ to fold in an AND as well.
5417 // TBZ has a smaller branch displacement than CBZ. If the offset is
5418 // out of bounds, a late MI-layer pass rewrites branches.
5419 // 403.gcc is an example that hits this case.
5420 if (LHS.getOpcode() == ISD::AND &&
5421 isa<ConstantSDNode>(LHS.getOperand(1)) &&
5422 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
5423 SDValue Test = LHS.getOperand(0);
5424 uint64_t Mask = LHS.getConstantOperandVal(1);
5425 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
5426 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
5427 Dest);
5428 }
5429
5430 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
5431 } else if (CC == ISD::SETNE) {
5432 // See if we can use a TBZ to fold in an AND as well.
5433 // TBZ has a smaller branch displacement than CBZ. If the offset is
5434 // out of bounds, a late MI-layer pass rewrites branches.
5435 // 403.gcc is an example that hits this case.
5436 if (LHS.getOpcode() == ISD::AND &&
5437 isa<ConstantSDNode>(LHS.getOperand(1)) &&
5438 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
5439 SDValue Test = LHS.getOperand(0);
5440 uint64_t Mask = LHS.getConstantOperandVal(1);
5441 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
5442 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
5443 Dest);
5444 }
5445
5446 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
5447 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
5448 // Don't combine AND since emitComparison converts the AND to an ANDS
5449 // (a.k.a. TST) and the test in the test bit and branch instruction
5450 // becomes redundant. This would also increase register pressure.
5451 uint64_t Mask = LHS.getValueSizeInBits() - 1;
5452 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
5453 DAG.getConstant(Mask, dl, MVT::i64), Dest);
5454 }
5455 }
5456 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
5457 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
5458 // Don't combine AND since emitComparison converts the AND to an ANDS
5459 // (a.k.a. TST) and the test in the test bit and branch instruction
5460 // becomes redundant. This would also increase register pressure.
5461 uint64_t Mask = LHS.getValueSizeInBits() - 1;
5462 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
5463 DAG.getConstant(Mask, dl, MVT::i64), Dest);
5464 }
5465
5466 SDValue CCVal;
5467 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
5468 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
5469 Cmp);
5470 }
5471
5472 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
5473 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
5474
5475 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
5476 // clean. Some of them require two branches to implement.
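// (For example, an unordered-or-equal (SETUEQ) test maps to CC1 = EQ and
// CC2 = VS, so two conditional branches to the same destination are built.)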
5477 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
5478 AArch64CC::CondCode CC1, CC2;
5479 changeFPCCToAArch64CC(CC, CC1, CC2);
5480 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
5481 SDValue BR1 =
5482 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
5483 if (CC2 != AArch64CC::AL) {
5484 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
5485 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
5486 Cmp);
5487 }
5488
5489 return BR1;
5490}
5491
5492SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
5493 SelectionDAG &DAG) const {
5494 EVT VT = Op.getValueType();
5495 SDLoc DL(Op);
5496
5497 SDValue In1 = Op.getOperand(0);
5498 SDValue In2 = Op.getOperand(1);
5499 EVT SrcVT = In2.getValueType();
5500
5501 if (SrcVT.bitsLT(VT))
5502 In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
5503 else if (SrcVT.bitsGT(VT))
5504 In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
5505
5506 EVT VecVT;
5507 uint64_t EltMask;
5508 SDValue VecVal1, VecVal2;
5509
5510 auto setVecVal = [&] (int Idx) {
5511 if (!VT.isVector()) {
5512 VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
5513 DAG.getUNDEF(VecVT), In1);
5514 VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
5515 DAG.getUNDEF(VecVT), In2);
5516 } else {
5517 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
5518 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
5519 }
5520 };
5521
5522 if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
5523 VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
5524 EltMask = 0x80000000ULL;
5525 setVecVal(AArch64::ssub);
5526 } else if (VT == MVT::f64 || VT == MVT::v2f64) {
5527 VecVT = MVT::v2i64;
5528
5529 // We want to materialize a mask with the high bit set, but the AdvSIMD
5530 // immediate moves cannot materialize that in a single instruction for
5531 // 64-bit elements. Instead, materialize zero and then negate it.
5532 EltMask = 0;
5533
5534 setVecVal(AArch64::dsub);
5535 } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
5536 VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
5537 EltMask = 0x8000ULL;
5538 setVecVal(AArch64::hsub);
5539 } else {
5540 llvm_unreachable("Invalid type for copysign!");
5541 }
5542
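// Each EltMask above covers exactly the sign bit of one element; the
// AArch64ISD::BIT node built below copies just that bit from In2 into In1,
// which is precisely the copysign semantics.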
5543 SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
5544
5545 // If we couldn't materialize the mask above, then the mask vector will be
5546 // the zero vector, and we need to negate it here.
5547 if (VT == MVT::f64 || VT == MVT::v2f64) {
5548 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
5549 BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
5550 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
5551 }
5552
5553 SDValue Sel =
5554 DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
5555
5556 if (VT == MVT::f16)
5557 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
5558 if (VT == MVT::f32)
5559 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
5560 else if (VT == MVT::f64)
5561 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
5562 else
5563 return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
5564}
5565
5566SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
5567 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
5568 Attribute::NoImplicitFloat))
5569 return SDValue();
5570
5571 if (!Subtarget->hasNEON())
5572 return SDValue();
5573
5574 // While there is no integer popcount instruction, it can
5575 // be more efficiently lowered to the following sequence that uses
5576 // AdvSIMD registers/instructions as long as the copies to/from
5577 // the AdvSIMD registers are cheap.
5578 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
5579 // CNT V0.8B, V0.8B // 8xbyte pop-counts
5580 // ADDV B0, V0.8B // sum 8xbyte pop-counts
5581 // UMOV X0, V0.B[0] // copy byte result back to integer reg
5582 SDValue Val = Op.getOperand(0);
5583 SDLoc DL(Op);
5584 EVT VT = Op.getValueType();
5585
5586 if (VT == MVT::i32 || VT == MVT::i64) {
5587 if (VT == MVT::i32)
5588 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
5589 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
5590
5591 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
5592 SDValue UaddLV = DAG.getNode(
5593 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
5594 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
5595
5596 if (VT == MVT::i64)
5597 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
5598 return UaddLV;
5599 } else if (VT == MVT::i128) {
5600 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
5601
5602 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
5603 SDValue UaddLV = DAG.getNode(
5604 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
5605 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
5606
5607 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
5608 }
5609
5610 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
5611 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
5612 "Unexpected type for custom ctpop lowering");
5613
5615 Val = DAG.getBitcast(VT8Bit, Val);
5616 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
5617
5618 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
5619 unsigned EltSize = 8;
5620 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
5621 while (EltSize != VT.getScalarSizeInBits()) {
5622 EltSize *= 2;
5623 NumElts /= 2;
5624 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
5625 Val = DAG.getNode(
5626 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
5627 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
5628 }
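// For example, a v4i32 ctpop: the per-byte v16i8 counts are pairwise widened
// v16i8 -> v8i16 -> v4i32 by two uaddlp steps (EltSize 8 -> 16 -> 32).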
5629
5630 return Val;
5631}
5632
5633SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
5634
5635 if (Op.getValueType().isVector())
5636 return LowerVSETCC(Op, DAG);
5637
5638 bool IsStrict = Op->isStrictFPOpcode();
5639 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
5640 unsigned OpNo = IsStrict ? 1 : 0;
5641 SDValue Chain;
5642 if (IsStrict)
5643 Chain = Op.getOperand(0);
5644 SDValue LHS = Op.getOperand(OpNo + 0);
5645 SDValue RHS = Op.getOperand(OpNo + 1);
5646 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
5647 SDLoc dl(Op);
5648
5649 // We chose ZeroOrOneBooleanContents, so use zero and one.
5650 EVT VT = Op.getValueType();
5651 SDValue TVal = DAG.getConstant(1, dl, VT);
5652 SDValue FVal = DAG.getConstant(0, dl, VT);
5653
5654 // Handle f128 first, since one possible outcome is a normal integer
5655 // comparison which gets picked up by the next if statement.
5656 if (LHS.getValueType() == MVT::f128) {
5657 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
5658 IsSignaling);
5659
5660 // If softenSetCCOperands returned a scalar, use it.
5661 if (!RHS.getNode()) {
5662 assert(LHS.getValueType() == Op.getValueType() &&
5663 "Unexpected setcc expansion!");
5664 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
5665 }
5666 }
5667
5668 if (LHS.getValueType().isInteger()) {
5669 SDValue CCVal;
5670 SDValue Cmp = getAArch64Cmp(
5671 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
5672
5673 // Note that we inverted the condition above, so we reverse the order of
5674 // the true and false operands here. This will allow the setcc to be
5675 // matched to a single CSINC instruction.
5676 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
5677 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
5678 }
5679
5680 // Now we know we're dealing with FP values.
5681 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
5682 LHS.getValueType() == MVT::f64);
5683
5684 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
5685 // and do the comparison.
5686 SDValue Cmp;
5687 if (IsStrict)
5688 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
5689 else
5690 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
5691
5692 AArch64CC::CondCode CC1, CC2;
5693 changeFPCCToAArch64CC(CC, CC1, CC2);
5694 SDValue Res;
5695 if (CC2 == AArch64CC::AL) {
5696 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
5697 CC2);
5698 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
5699
5700 // Note that we inverted the condition above, so we reverse the order of
5701 // the true and false operands here. This will allow the setcc to be
5702 // matched to a single CSINC instruction.
5703 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
5704 } else {
5705 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
5706 // totally clean. Some of them require two CSELs to implement. As is in
5707 // this case, we emit the first CSEL and then emit a second using the output
5708 // of the first as the RHS. We're effectively OR'ing the two CC's together.
5709
5710 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
5711 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
5712 SDValue CS1 =
5713 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
5714
5715 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
5716 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
5717 }
5718 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
5719}
5720
5721SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
5722 SDValue RHS, SDValue TVal,
5723 SDValue FVal, const SDLoc &dl,
5724 SelectionDAG &DAG) const {
5725 // Handle f128 first, because it will result in a comparison of some RTLIB
5726 // call result against zero.
5727 if (LHS.getValueType() == MVT::f128) {
5728 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
5729
5730 // If softenSetCCOperands returned a scalar, we need to compare the result
5731 // against zero to select between true and false values.
5732 if (!RHS.getNode()) {
5733 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5734 CC = ISD::SETNE;
5735 }
5736 }
5737
5738 // Also handle f16, for which we need to do a f32 comparison.
5739 if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5740 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
5741 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
5742 }
5743
5744 // Next, handle integers.
5745 if (LHS.getValueType().isInteger()) {
5746 assert((LHS.getValueType() == RHS.getValueType()) &&
5747 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
5748
5749 unsigned Opcode = AArch64ISD::CSEL;
5750
5751 // If both the TVal and the FVal are constants, see if we can swap them in
5752 // order to form a CSINV or CSINC out of them.
5753 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
5754 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
5755
5756 if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
5757 std::swap(TVal, FVal);
5758 std::swap(CTVal, CFVal);
5759 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5760 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
5761 std::swap(TVal, FVal);
5762 std::swap(CTVal, CFVal);
5763 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5764 } else if (TVal.getOpcode() == ISD::XOR) {
5765 // If TVal is a NOT we want to swap TVal and FVal so that we can match
5766 // with a CSINV rather than a CSEL.
5767 if (isAllOnesConstant(TVal.getOperand(1))) {
5768 std::swap(TVal, FVal);
5769 std::swap(CTVal, CFVal);
5770 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5771 }
5772 } else if (TVal.getOpcode() == ISD::SUB) {
5773 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
5774 // that we can match with a CSNEG rather than a CSEL.
5775 if (isNullConstant(TVal.getOperand(0))) {
5776 std::swap(TVal, FVal);
5777 std::swap(CTVal, CFVal);
5778 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5779 }
5780 } else if (CTVal && CFVal) {
5781 const int64_t TrueVal = CTVal->getSExtValue();
5782 const int64_t FalseVal = CFVal->getSExtValue();
5783 bool Swap = false;
5784
5785 // If both TVal and FVal are constants, see if FVal is the
5786 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
5787 // instead of a CSEL in that case.
5788 if (TrueVal == ~FalseVal) {
5789 Opcode = AArch64ISD::CSINV;
5790 } else if (TrueVal == -FalseVal) {
5791 Opcode = AArch64ISD::CSNEG;
5792 } else if (TVal.getValueType() == MVT::i32) {
5793 // If our operands are only 32-bit wide, make sure we use 32-bit
5794 // arithmetic for the check whether we can use CSINC. This ensures that
5795 // the addition in the check will wrap around properly in case there is
5796 // an overflow (which would not be the case if we do the check with
5797 // 64-bit arithmetic).
5798 const uint32_t TrueVal32 = CTVal->getZExtValue();
5799 const uint32_t FalseVal32 = CFVal->getZExtValue();
5800
5801 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
5802 Opcode = AArch64ISD::CSINC;
5803
5804 if (TrueVal32 > FalseVal32) {
5805 Swap = true;
5806 }
5807 }
5808 // 64-bit check whether we can use CSINC.
5809 } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
5810 Opcode = AArch64ISD::CSINC;
5811
5812 if (TrueVal > FalseVal) {
5813 Swap = true;
5814 }
5815 }
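// Illustration of the width concern above: for i32 values TVal = INT32_MIN
// and FVal = INT32_MAX, TrueVal32 == FalseVal32 + 1 holds after wrap-around,
// so CSINC applies, while the sign-extended 64-bit check would miss it.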
5816
5817 // Swap TVal and FVal if necessary.
5818 if (Swap) {
5819 std::swap(TVal, FVal);
5820 std::swap(CTVal, CFVal);
5821 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5822 }
5823
5824 if (Opcode != AArch64ISD::CSEL) {
5825 // Drop FVal since we can get its value by simply inverting/negating
5826 // TVal.
5827 FVal = TVal;
5828 }
5829 }
5830
5831 // Avoid materializing a constant when possible by reusing a known value in
5832 // a register. However, don't perform this optimization if the known value
5833 // is one, zero or negative one in the case of a CSEL. We can always
5834 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
5835 // FVal, respectively.
5836 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
5837 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
5838 !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
5839 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
5840 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
5841 // "a != C ? x : a" to avoid materializing C.
5842 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
5843 TVal = LHS;
5844 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
5845 FVal = LHS;
5846 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
5847 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
5848 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
5849 // avoid materializing C.
5850 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
5851 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
5852 Opcode = AArch64ISD::CSINV;
5853 TVal = LHS;
5854 FVal = DAG.getConstant(0, dl, FVal.getValueType());
5855 }
5856 }
5857
5858 SDValue CCVal;
5859 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
5860 EVT VT = TVal.getValueType();
5861 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
5862 }
5863
5864 // Now we know we're dealing with FP values.
5865 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
5866 LHS.getValueType() == MVT::f64);
5867 assert(LHS.getValueType() == RHS.getValueType());
5868 EVT VT = TVal.getValueType();
5869 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
5870
5871 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
5872 // clean. Some of them require two CSELs to implement.
5873 AArch64CC::CondCode CC1, CC2;
5874 changeFPCCToAArch64CC(CC, CC1, CC2);
5875
5876 if (DAG.getTarget().Options.UnsafeFPMath) {
5877 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
5878 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
5879 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
5880 if (RHSVal && RHSVal->isZero()) {
5881 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
5882 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
5883
5884 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
5885 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
5886 TVal = LHS;
5887 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
5888 CFVal && CFVal->isZero() &&
5889 FVal.getValueType() == LHS.getValueType())
5890 FVal = LHS;
5891 }
5892 }
5893
5894 // Emit first, and possibly only, CSEL.
5895 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
5896 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
5897
5898 // If we need a second CSEL, emit it, using the output of the first as the
5899 // RHS. We're effectively OR'ing the two CC's together.
5900 if (CC2 != AArch64CC::AL) {
5901 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
5902 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
5903 }
5904
5905 // Otherwise, return the output of the first CSEL.
5906 return CS1;
5907}
5908
5909SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
5910 SelectionDAG &DAG) const {
5911 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5912 SDValue LHS = Op.getOperand(0);
5913 SDValue RHS = Op.getOperand(1);
5914 SDValue TVal = Op.getOperand(2);
5915 SDValue FVal = Op.getOperand(3);
5916 SDLoc DL(Op);
5917 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
5918}
5919
5920SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
5921 SelectionDAG &DAG) const {
5922 SDValue CCVal = Op->getOperand(0);
5923 SDValue TVal = Op->getOperand(1);
5924 SDValue FVal = Op->getOperand(2);
5925 SDLoc DL(Op);
5926
5927 EVT Ty = Op.getValueType();
5928 if (Ty.isScalableVector()) {
5929 SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
5930 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
5931 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
5932 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
5933 }
5934
5935 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
5936 // instruction.
5937 if (ISD::isOverflowIntrOpRes(CCVal)) {
5938 // Only lower legal XALUO ops.
5939 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
5940 return SDValue();
5941
5942 AArch64CC::CondCode OFCC;
5943 SDValue Value, Overflow;
5944 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
5945 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
5946
5947 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
5948 CCVal, Overflow);
5949 }
5950
5951 // Lower it the same way as we would lower a SELECT_CC node.
5952 ISD::CondCode CC;
5953 SDValue LHS, RHS;
5954 if (CCVal.getOpcode() == ISD::SETCC) {
5955 LHS = CCVal.getOperand(0);
5956 RHS = CCVal.getOperand(1);
5957 CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
5958 } else {
5959 LHS = CCVal;
5960 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
5961 CC = ISD::SETNE;
5962 }
5963 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
5964}
5965
5966SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
5967 SelectionDAG &DAG) const {
5968 // Jump table entries as PC relative offsets. No additional tweaking
5969 // is necessary here. Just get the address of the jump table.
5970 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
5971
5972 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
5973 !Subtarget->isTargetMachO()) {
5974 return getAddrLarge(JT, DAG);
5975 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
5976 return getAddrTiny(JT, DAG);
5977 }
5978 return getAddr(JT, DAG);
5979}
5980
5981SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
5982 SelectionDAG &DAG) const {
5983 // Jump table entries as PC relative offsets. No additional tweaking
5984 // is necessary here. Just get the address of the jump table.
5985 SDLoc DL(Op);
5986 SDValue JT = Op.getOperand(1);
5987 SDValue Entry = Op.getOperand(2);
5988 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
5989
5990 SDNode *Dest =
5991 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
5992 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
5993 return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
5994 SDValue(Dest, 0));
5995}
5996
5997SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
5998 SelectionDAG &DAG) const {
5999 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
6000
6001 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
6002 // Use the GOT for the large code model on iOS.
6003 if (Subtarget->isTargetMachO()) {
6004 return getGOT(CP, DAG);
6005 }
6006 return getAddrLarge(CP, DAG);
6007 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
6008 return getAddrTiny(CP, DAG);
6009 } else {
6010 return getAddr(CP, DAG);
6011 }
6012}
6013
6014SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
6015 SelectionDAG &DAG) const {
6016 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
6017 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
6018 !Subtarget->isTargetMachO()) {
6019 return getAddrLarge(BA, DAG);
6020 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
6021 return getAddrTiny(BA, DAG);
6022 }
6023 return getAddr(BA, DAG);
6024}
6025
6026SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
6027 SelectionDAG &DAG) const {
6028 AArch64FunctionInfo *FuncInfo =
6029 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
6030
6031 SDLoc DL(Op);
6032 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
6033 getPointerTy(DAG.getDataLayout()));
6034
6035 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6036 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
6037 MachinePointerInfo(SV));
6038}
6039
6040SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
6041 SelectionDAG &DAG) const {
6042 AArch64FunctionInfo *FuncInfo =
6043 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
6044
6045 SDLoc DL(Op);
6046 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
6047 ? FuncInfo->getVarArgsGPRIndex()
6048 : FuncInfo->getVarArgsStackIndex(),
6049 getPointerTy(DAG.getDataLayout()));
6050 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6051 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
6052 MachinePointerInfo(SV));
6053}
6054
6055SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
6056 SelectionDAG &DAG) const {
6057 // The layout of the va_list struct is specified in the AArch64 Procedure Call
6058 // Standard, section B.3.
6061 auto PtrVT = getPointerTy(DAG.getDataLayout());
6062 SDLoc DL(Op);
6063
6064 SDValue Chain = Op.getOperand(0);
6065 SDValue VAList = Op.getOperand(1);
6066 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6068
6069 // void *__stack at offset 0
6071 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
6072 MachinePointerInfo(SV), /* Alignment = */ 8));
6073
6074 // void *__gr_top at offset 8
6075 int GPRSize = FuncInfo->getVarArgsGPRSize();
6076 if (GPRSize > 0) {
6078
6079 GRTopAddr =
6080 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));
6081
6082 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
6084 DAG.getConstant(GPRSize, DL, PtrVT));
6085
6086 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
6087 MachinePointerInfo(SV, 8),
6088 /* Alignment = */ 8));
6089 }
6090
6091 // void *__vr_top at offset 16
6092 int FPRSize = FuncInfo->getVarArgsFPRSize();
6093 if (FPRSize > 0) {
6096 DAG.getConstant(16, DL, PtrVT));
6097
6098 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
6100 DAG.getConstant(FPRSize, DL, PtrVT));
6101
6102 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
6103 MachinePointerInfo(SV, 16),
6104 /* Alignment = */ 8));
6105 }
6106
6107 // int __gr_offs at offset 24
6109 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
6110 MemOps.push_back(DAG.getStore(
6111 Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
6112 MachinePointerInfo(SV, 24), /* Alignment = */ 4));
6113
6114 // int __vr_offs at offset 28
6116 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
6117 MemOps.push_back(DAG.getStore(
6118 Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
6119 MachinePointerInfo(SV, 28), /* Alignment = */ 4));
6120
6122}
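
// For reference, the va_list layout that LowerAAPCS_VASTART materializes,
// written as a C struct (field names and offsets are the ones AAPCS64
// section B.3 specifies; shown here purely as an illustration):
struct AAPCS64VaListModel {
  void *Stack;  // offset 0:  __stack, next stacked argument
  void *GRTop;  // offset 8:  __gr_top, byte past the GP register save area
  void *VRTop;  // offset 16: __vr_top, byte past the FP/SIMD save area
  int GROffs;   // offset 24: __gr_offs, negative offset from __gr_top
  int VROffs;   // offset 28: __vr_offs, negative offset from __vr_top
};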
6123
6124SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
6125 SelectionDAG &DAG) const {
6126 MachineFunction &MF = DAG.getMachineFunction();
6127
6128 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
6129 return LowerWin64_VASTART(Op, DAG);
6130 else if (Subtarget->isTargetDarwin())
6131 return LowerDarwin_VASTART(Op, DAG);
6132 else
6133 return LowerAAPCS_VASTART(Op, DAG);
6134}
6135
6136SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
6137 SelectionDAG &DAG) const {
6138 // The AAPCS va_list is three pointers and two ints (= 32 bytes); Darwin and
6139 // Windows use a single pointer.
6140 SDLoc DL(Op);
6141 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
6142 unsigned VaListSize = (Subtarget->isTargetDarwin() ||
6143 Subtarget->isTargetWindows()) ? PtrSize : 32;
6144 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
6145 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
6146
6147 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
6149 Align(PtrSize), false, false, false,
6151}
6152
6153SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
6154 assert(Subtarget->isTargetDarwin() &&
6155 "automatic va_arg instruction only works on Darwin");
6156
6157 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6158 EVT VT = Op.getValueType();
6159 SDLoc DL(Op);
6160 SDValue Chain = Op.getOperand(0);
6161 SDValue Addr = Op.getOperand(1);
6162 MaybeAlign Align(Op.getConstantOperandVal(3));
6163 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
6164 auto PtrVT = getPointerTy(DAG.getDataLayout());
6166 SDValue VAList =
6167 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
6168 Chain = VAList.getValue(1);
6170
6171 if (Align && *Align > MinSlotSize) {
6172 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
6173 DAG.getConstant(Align->value() - 1, DL, PtrVT));
6174 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
6175 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
6176 }
6177
6178 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
6179 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
6180
6181 // Scalar integer and FP values smaller than 64 bits are implicitly extended
6182 // up to 64 bits. At the very least, we have to increase the striding of the
6183 // vaargs list to match this, and for FP values we need to introduce
6184 // FP_ROUND nodes as well.
6185 if (VT.isInteger() && !VT.isVector())
6186 ArgSize = std::max(ArgSize, MinSlotSize);
6187 bool NeedFPTrunc = false;
6188 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
6189 ArgSize = 8;
6190 NeedFPTrunc = true;
6191 }
6192
6193 // Increment the pointer, VAList, to the next vaarg
6195 DAG.getConstant(ArgSize, DL, PtrVT));
6197
6198 // Store the incremented VAList to the legalized pointer
6200 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
6201
6202 // Load the actual argument out of the pointer VAList
6203 if (NeedFPTrunc) {
6204 // Load the value as an f64.
6205 SDValue WideFP =
6207 // Round the value down to an f32.
6208 SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
6209 DAG.getIntPtrConstant(1, DL));
6210 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
6211 // Merge the rounded value with the chain output of the load.
6212 return DAG.getMergeValues(Ops, DL);
6213 }
6214
6215 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
6216}
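
// The ADD/AND pair emitted above for over-aligned va_arg slots is the usual
// power-of-two round-up; a scalar model of that computation (assumes Align
// is a power of two, which LLVM alignments always are):
static uint64_t alignUpModel(uint64_t Ptr, uint64_t Align) {
  return (Ptr + Align - 1) & -Align; // -Align == ~(Align - 1) for powers of 2
}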
6217
6218SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
6219 SelectionDAG &DAG) const {
6221 MFI.setFrameAddressIsTaken(true);
6222
6223 EVT VT = Op.getValueType();
6224 SDLoc DL(Op);
6225 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6227 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
6228 while (Depth--)
6229 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
6231
6232 if (Subtarget->isTargetILP32())
6234 DAG.getValueType(VT));
6235
6236 return FrameAddr;
6237}
6238
6239SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
6240 SelectionDAG &DAG) const {
6242
6243 EVT VT = getPointerTy(DAG.getDataLayout());
6244 SDLoc DL(Op);
6245 int FI = MFI.CreateFixedObject(4, 0, false);
6246 return DAG.getFrameIndex(FI, VT);
6247}
6248
6249#define GET_REGISTER_MATCHER
6250#include "AArch64GenAsmMatcher.inc"
6251
6252// FIXME? Maybe this could be a TableGen attribute on some registers and
6253// this table could be generated automatically from RegInfo.
6254Register AArch64TargetLowering::
6255getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
6257 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
6258 const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
6259 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
6260 if (!Subtarget->isXRegisterReserved(DwarfRegNum))
6261 Reg = 0;
6262 }
6263 if (Reg)
6264 return Reg;
6265 report_fatal_error(Twine("Invalid register name \""
6266 + StringRef(RegName) + "\"."));
6267}
6268
6269SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
6270 SelectionDAG &DAG) const {
6272
6273 EVT VT = Op.getValueType();
6274 SDLoc DL(Op);
6275
6277 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
6279
6280 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
6281}
6282
6283SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
6284 SelectionDAG &DAG) const {
6286 MachineFrameInfo &MFI = MF.getFrameInfo();
6287 MFI.setReturnAddressIsTaken(true);
6288
6289 EVT VT = Op.getValueType();
6290 SDLoc DL(Op);
6291 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6292 if (Depth) {
6293 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6295 return DAG.getLoad(VT, DL, DAG.getEntryNode(),
6296 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
6298 }
6299
6300 // Return LR, which contains the return address. Mark it an implicit live-in.
6301 unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
6302 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6303}
6304
6305 /// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
6306 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
6307SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
6308 SelectionDAG &DAG) const {
6309 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6310 EVT VT = Op.getValueType();
6311 unsigned VTBits = VT.getSizeInBits();
6312 SDLoc dl(Op);
6313 SDValue ShOpLo = Op.getOperand(0);
6314 SDValue ShOpHi = Op.getOperand(1);
6315 SDValue ShAmt = Op.getOperand(2);
6316 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6317
6318 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6319
6321 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
6323
6324 // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
6325 // is "undef". We wanted 0, so CSEL it directly.
6327 ISD::SETEQ, dl, DAG);
6328 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
6329 HiBitsForLo =
6330 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
6331 HiBitsForLo, CCVal, Cmp);
6332
6334 DAG.getConstant(VTBits, dl, MVT::i64));
6335
6338 DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
6339
6341 dl, DAG);
6342 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
6343 SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6345 LoForNormalShift, CCVal, Cmp);
6346
6347 // AArch64 shifts larger than the register width are wrapped rather than
6348 // clamped, so we can't just emit "hi >> x".
6349 SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6351 Opc == ISD::SRA
6352 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6353 DAG.getConstant(VTBits - 1, dl, MVT::i64))
6354 : DAG.getConstant(0, dl, VT);
6356 HiForNormalShift, CCVal, Cmp);
6357
6358 SDValue Ops[2] = { Lo, Hi };
6359 return DAG.getMergeValues(Ops, dl);
6360}
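
// A branchy scalar model of the SRL_PARTS lowering above for a 128-bit value
// Hi:Lo (illustrative sketch; assumes 0 <= S < 128; for SRA_PARTS the normal
// shift is arithmetic and the big-shift high word is the sign fill). The DAG
// version computes every candidate unconditionally and picks with CSEL, which
// is safe because AArch64 shifts wrap rather than trap on large amounts.
static std::pair<uint64_t, uint64_t> lsr128Model(uint64_t Lo, uint64_t Hi,
                                                 unsigned S) {
  if (S == 0)
    return {Lo, Hi}; // guards the undef "Hi << 64" case
  if (S >= 64)
    return {Hi >> (S - 64), 0}; // big shift: low word comes from Hi
  return {(Lo >> S) | (Hi << (64 - S)), Hi >> S};
}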
6361
6362 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6363 /// i64 values and takes a 2 x i64 value to shift plus a shift amount.
6364SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
6365 SelectionDAG &DAG) const {
6366 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6367 EVT VT = Op.getValueType();
6368 unsigned VTBits = VT.getSizeInBits();
6369 SDLoc dl(Op);
6370 SDValue ShOpLo = Op.getOperand(0);
6371 SDValue ShOpHi = Op.getOperand(1);
6372 SDValue ShAmt = Op.getOperand(2);
6373
6374 assert(Op.getOpcode() == ISD::SHL_PARTS);
6376 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
6378
6379 // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
6380 // is "undef". We wanted 0, so CSEL it directly.
6382 ISD::SETEQ, dl, DAG);
6383 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
6384 LoBitsForHi =
6385 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
6386 LoBitsForHi, CCVal, Cmp);
6387
6389 DAG.getConstant(VTBits, dl, MVT::i64));
6392 DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
6393
6395
6397 dl, DAG);
6398 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
6400 HiForNormalShift, CCVal, Cmp);
6401
6402 // AArch64 shifts larger than the register width are wrapped rather than
6403 // clamped, so we can't just emit "lo << a" if a is too big.
6404 SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
6407 LoForNormalShift, CCVal, Cmp);
6408
6409 SDValue Ops[2] = { Lo, Hi };
6410 return DAG.getMergeValues(Ops, dl);
6411}
6412
6414 const GlobalAddressSDNode *GA) const {
6415 // Offsets are folded in the DAG combine rather than here so that we can
6416 // intelligently choose an offset based on the uses.
6417 return false;
6418}
6419
6421 bool OptForSize) const {
6422 bool IsLegal = false;
6423 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
6424 // and for the 16-bit case when the target has full fp16 support.
6425 // FIXME: We should be able to handle f128 as well with a clever lowering.
6426 const APInt ImmInt = Imm.bitcastToAPInt();
6427 if (VT == MVT::f64)
6429 else if (VT == MVT::f32)
6431 else if (VT == MVT::f16 && Subtarget->hasFullFP16())
6433 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
6434 // generate that fmov.
6435
6436 // If we cannot materialize the value in an fmov immediate field, check if the
6437 // value can be encoded as the immediate operand of a logical instruction.
6438 // The immediate value will be created with either MOVZ, MOVN, or ORR.
6439 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
6440 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
6441 // however the mov+fmov sequence is always better because of the reduced
6442 // cache pressure. The timings are still the same if you consider
6443 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
6444 // movw+movk is fused). So we limit the expansion to at most 2 instructions.
6446 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
6447 Insn);
6448 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
6449 IsLegal = Insn.size() <= Limit;
6450 }
6451
6452 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
6453 << " imm value: "; Imm.dump(););
6454 return IsLegal;
6455}
6456
6457//===----------------------------------------------------------------------===//
6458// AArch64 Optimization Hooks
6459//===----------------------------------------------------------------------===//
6460
6461static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
6462 SDValue Operand, SelectionDAG &DAG,
6463 int &ExtraSteps) {
6464 EVT VT = Operand.getValueType();
6465 if (ST->hasNEON() &&
6466 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
6467 VT == MVT::f32 || VT == MVT::v1f32 ||
6468 VT == MVT::v2f32 || VT == MVT::v4f32)) {
6470 // For the reciprocal estimates, convergence is quadratic, so the number
6471 // of digits is doubled after each iteration. In ARMv8, the accuracy of
6472 // the initial estimate is 2^-8. Thus the number of extra steps to refine
6473 // the result for float (23 mantissa bits) is 2 and for double (52
6474 // mantissa bits) is 3.
6475 ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
6476
6477 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
6478 }
6479
6480 return SDValue();
6481}
6482
6483SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
6484 SelectionDAG &DAG, int Enabled,
6485 int &ExtraSteps,
6486 bool &UseOneConst,
6487 bool Reciprocal) const {
6490 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
6491 DAG, ExtraSteps)) {
6492 SDLoc DL(Operand);
6493 EVT VT = Operand.getValueType();
6494
6495 SDNodeFlags Flags;
6496 Flags.setAllowReassociation(true);
6497
6498 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
6499 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
6500 for (int i = ExtraSteps; i > 0; --i) {
6501 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
6502 Flags);
6503 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
6504 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
6505 }
6506 if (!Reciprocal) {
6508 VT);
6509 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
6510 SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
6511
6512 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
6513 // Correct the result if the operand is 0.0.
6514 Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
6515 VT, Eq, Operand, Estimate);
6516 }
6517
6518 ExtraSteps = 0;
6519 return Estimate;
6520 }
6521
6522 return SDValue();
6523}
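
// A scalar model of the refinement loop above (illustrative sketch): starting
// from an estimate E accurate to about 2^-8, as FRSQRTE provides, each step
// computes E * 0.5 * (3 - X * E^2) and doubles the number of correct digits.
static double rsqrtRefineModel(double X, double E, int Steps) {
  for (int I = 0; I < Steps; ++I)
    E = E * 0.5 * (3.0 - X * E * E); // FRSQRTS supplies 0.5 * (3 - M * N)
  return E; // Steps == 2 for float, 3 for double
}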
6524
6525SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
6526 SelectionDAG &DAG, int Enabled,
6527 int &ExtraSteps) const {
6529 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
6530 DAG, ExtraSteps)) {
6531 SDLoc DL(Operand);
6532 EVT VT = Operand.getValueType();
6533
6534 SDNodeFlags Flags;
6535 Flags.setAllowReassociation(true);
6536
6537 // Newton reciprocal iteration: E * (2 - X * E)
6538 // AArch64 reciprocal iteration instruction: (2 - M * N)
6539 for (int i = ExtraSteps; i > 0; --i) {
6540 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
6541 Estimate, Flags);
6542 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
6543 }
6544
6545 ExtraSteps = 0;
6546 return Estimate;
6547 }
6548
6549 return SDValue();
6550}
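
// The matching scalar model for the reciprocal loop above (illustrative
// sketch): each step computes E * (2 - X * E), again doubling the digits.
static double recipRefineModel(double X, double E, int Steps) {
  for (int I = 0; I < Steps; ++I)
    E = E * (2.0 - X * E); // FRECPS supplies (2 - M * N)
  return E;
}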
6551
6552//===----------------------------------------------------------------------===//
6553// AArch64 Inline Assembly Support
6554//===----------------------------------------------------------------------===//
6555
6556// Table of Constraints
6557// TODO: This is the current set of constraints supported by ARM for the
6558 // compiler; not all of them may make sense.
6559//
6560// r - A general register
6561// w - An FP/SIMD register of some size in the range v0-v31
6562// x - An FP/SIMD register of some size in the range v0-v15
6563// I - Constant that can be used with an ADD instruction
6564// J - Constant that can be used with a SUB instruction
6565// K - Constant that can be used with a 32-bit logical instruction
6566// L - Constant that can be used with a 64-bit logical instruction
6567// M - Constant that can be used as a 32-bit MOV immediate
6568// N - Constant that can be used as a 64-bit MOV immediate
6569// Q - A memory reference with base register and no offset
6570// S - A symbolic address
6571// Y - Floating point constant zero
6572// Z - Integer constant zero
6573//
6574// Note that general register operands will be output using their 64-bit x
6575// register name, whatever the size of the variable, unless the asm operand
6576// is prefixed by the %w modifier. Floating-point and SIMD register operands
6577// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
6578// %q modifier.
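
// A hedged usage example for the 'r' and 'w' constraints above, in GNU inline
// asm (only meaningful when compiling for an AArch64 target; the %d modifier
// prints the 64-bit d-register name instead of the default v-register name):
//
//   double doubleIt(double In) {
//     double Out;
//     asm("fadd %d0, %d1, %d1" : "=w"(Out) : "w"(In));
//     return Out;
//   }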
6579const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
6580 // At this point, we have to lower this constraint to something else, so we
6581 // lower it to an "r" or "w". However, by doing this we will force the result
6582 // to be in register, while the X constraint is much more permissive.
6583 //
6584 // Although we are correct (we are free to emit anything, without
6585 // constraints), we might break use cases that would expect us to be more
6586 // efficient and emit something else.
6587 if (!Subtarget->hasFPARMv8())
6588 return "r";
6589
6590 if (ConstraintVT.isFloatingPoint())
6591 return "w";
6592
6593 if (ConstraintVT.isVector() &&
6594 (ConstraintVT.getSizeInBits() == 64 ||
6595 ConstraintVT.getSizeInBits() == 128))
6596 return "w";
6597
6598 return "r";
6599}
6600
6601 enum PredicateConstraint {
6602 Upl,
6603 Upa,
6604 Invalid
6605 };
6606
6607 static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
6608 PredicateConstraint P = PredicateConstraint::Invalid;
6609 if (Constraint == "Upa")
6610 P = PredicateConstraint::Upa;
6611 if (Constraint == "Upl")
6612 P = PredicateConstraint::Upl;
6613 return P;
6614}
6615
6616/// getConstraintType - Given a constraint letter, return the type of
6617/// constraint it is for this target.
6619AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
6620 if (Constraint.size() == 1) {
6621 switch (Constraint[0]) {
6622 default:
6623 break;
6624 case 'x':
6625 case 'w':
6626 case 'y':
6627 return C_RegisterClass;
6628 // An address with a single base register. Due to the way we
6629 // currently handle addresses it is the same as 'r'.
6630 case 'Q':
6631 return C_Memory;
6632 case 'I':
6633 case 'J':
6634 case 'K':
6635 case 'L':
6636 case 'M':
6637 case 'N':
6638 case 'Y':
6639 case 'Z':
6640 return C_Immediate;
6641 case 'z':
6642 case 'S': // A symbolic address
6643 return C_Other;
6644 }
6645 } else if (parsePredicateConstraint(Constraint) !=
6647 return C_RegisterClass;
6648 return TargetLowering::getConstraintType(Constraint);
6649}
6650
6651/// Examine constraint type and operand type and determine a weight value.
6652/// This object must already have been set up with the operand type
6653/// and the current alternative constraint selected.
6655AArch64TargetLowering::getSingleConstraintMatchWeight(
6656 AsmOperandInfo &info, const char *constraint) const {
6658 Value *CallOperandVal = info.CallOperandVal;
6659 // If we don't have a value, we can't do a match,
6660 // but allow it at the lowest weight.
6661 if (!CallOperandVal)
6662 return CW_Default;
6663 Type *type = CallOperandVal->getType();
6664 // Look at the constraint type.
6665 switch (*constraint) {
6666 default:
6668 break;
6669 case 'x':
6670 case 'w':
6671 case 'y':
6672 if (type->isFloatingPointTy() || type->isVectorTy())
6673 weight = CW_Register;
6674 break;
6675 case 'z':
6676 weight = CW_Constant;
6677 break;
6678 case 'U':
6680 weight = CW_Register;
6681 break;
6682 }
6683 return weight;
6684}
6685
6686std::pair<unsigned, const TargetRegisterClass *>
6687AArch64TargetLowering::getRegForInlineAsmConstraint(
6688 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
6689 if (Constraint.size() == 1) {
6690 switch (Constraint[0]) {
6691 case 'r':
6692 if (VT.getSizeInBits() == 64)
6693 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
6694 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
6695 case 'w':
6696 if (!Subtarget->hasFPARMv8())
6697 break;
6698 if (VT.isScalableVector())
6699 return std::make_pair(0U, &AArch64::ZPRRegClass);
6700 if (VT.getSizeInBits() == 16)
6701 return std::make_pair(0U, &AArch64::FPR16RegClass);
6702 if (VT.getSizeInBits() == 32)
6703 return std::make_pair(0U, &AArch64::FPR32RegClass);
6704 if (VT.getSizeInBits() == 64)
6705 return std::make_pair(0U, &AArch64::FPR64RegClass);
6706 if (VT.getSizeInBits() == 128)
6707 return std::make_pair(0U, &AArch64::FPR128RegClass);
6708 break;
6709 // The instructions that this constraint is designed for can
6710 // only take 128-bit registers so just use that regclass.
6711 case 'x':
6712 if (!Subtarget->hasFPARMv8())
6713 break;
6714 if (VT.isScalableVector())
6715 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
6716 if (VT.getSizeInBits() == 128)
6717 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
6718 break;
6719 case 'y':
6720 if (!Subtarget->hasFPARMv8())
6721 break;
6722 if (VT.isScalableVector())
6723 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
6724 break;
6725 }
6726 } else {
7727 PredicateConstraint PC = parsePredicateConstraint(Constraint);
7728 if (PC != PredicateConstraint::Invalid) {
7729 assert(VT.isScalableVector());
7730 bool restricted = (PC == PredicateConstraint::Upl);
6731 return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
6732 : std::make_pair(0U, &AArch64::PPRRegClass);
6733 }
6734 }
6735 if (StringRef("{cc}").equals_lower(Constraint))
6736 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
6737
6738 // Use the default implementation in TargetLowering to convert the register
6739 // constraint into a member of a register class.
6740 std::pair<unsigned, const TargetRegisterClass *> Res;
6742
6743 // Not found as a standard register?
6744 if (!Res.second) {
6745 unsigned Size = Constraint.size();
6746 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
6747 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
6748 int RegNo;
6749 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
6750 if (!Failed && RegNo >= 0 && RegNo <= 31) {
6751 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
6752 // By default we'll emit v0-v31 for this unless there's a modifier, in
6753 // which case we'll emit the correctly sized register instead.
6754 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
6755 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
6756 Res.second = &AArch64::FPR64RegClass;
6757 } else {
6758 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
6759 Res.second = &AArch64::FPR128RegClass;
6760 }
6761 }
6762 }
6763 }
6764
6765 if (Res.second && !Subtarget->hasFPARMv8() &&
6766 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
6767 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
6768 return std::make_pair(0U, nullptr);
6769
6770 return Res;
6771}
6772
6773/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
6774/// vector. If it is invalid, don't add anything to Ops.
6775void AArch64TargetLowering::LowerAsmOperandForConstraint(
6776 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
6777 SelectionDAG &DAG) const {
6778 SDValue Result;
6779
6780 // Currently only support length 1 constraints.
6781 if (Constraint.length() != 1)
6782 return;
6783
6784 char ConstraintLetter = Constraint[0];
6785 switch (ConstraintLetter) {
6786 default:
6787 break;
6788
6789 // This set of constraints deals with valid constants for various instructions.
6790 // Validate and return a target constant for them if we can.
6791 case 'z': {
6792 // 'z' maps to xzr or wzr so it needs an input of 0.
6793 if (!isNullConstant(Op))
6794 return;
6795
6796 if (Op.getValueType() == MVT::i64)
6797 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
6798 else
6799 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
6800 break;
6801 }
6802 case 'S': {
6803 // An absolute symbolic address or label reference.
6804 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
6805 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
6806 GA->getValueType(0));
6807 } else if (const BlockAddressSDNode *BA =
6808 dyn_cast<BlockAddressSDNode>(Op)) {
6809 Result =
6810 DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
6811 } else if (const ExternalSymbolSDNode *ES =
6812 dyn_cast<ExternalSymbolSDNode>(Op)) {
6813 Result =
6814 DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0));
6815 } else
6816 return;
6817 break;
6818 }
6819
6820 case 'I':
6821 case 'J':
6822 case 'K':
6823 case 'L':
6824 case 'M':
6825 case 'N':
6826 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
6827 if (!C)
6828 return;
6829
6830 // Grab the value and do some validation.
6831 uint64_t CVal = C->getZExtValue();
6832 switch (ConstraintLetter) {
6833 // The I constraint applies only to simple ADD or SUB immediate operands:
6834 // i.e. 0 to 4095 with optional shift by 12
6835 // The J constraint applies only to ADD or SUB immediates that would be
6836 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
6837 // instruction [or vice versa], in other words -1 to -4095 with optional
6838 // left shift by 12.
6839 case 'I':
6840 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
6841 break;
6842 return;
6843 case 'J': {
6844 uint64_t NVal = -C->getSExtValue();
6845 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
6846 CVal = C->getSExtValue();
6847 break;
6848 }
6849 return;
6850 }
6851 // The K and L constraints apply *only* to logical immediates, including
6852 // what used to be the MOVI alias for ORR (though the MOVI alias has now
6853 // been removed and MOV should be used). So these constraints have to
6854 // distinguish between bit patterns that are valid 32-bit or 64-bit
6855 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
6856 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
6857 // versa.
6858 case 'K':
6859 if (AArch64_AM::isLogicalImmediate(CVal, 32))
6860 break;
6861 return;
6862 case 'L':
6863 if (AArch64_AM::isLogicalImmediate(CVal, 64))
6864 break;
6865 return;
6866 // The M and N constraints are a superset of K and L respectively, for use
6867 // with the MOV (immediate) alias. As well as the logical immediates they
6868 // also match 32 or 64-bit immediates that can be loaded either using a
6869 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
6870 // (M) or 64-bit 0x1234000000000000 (N) etc.
6871 // As a note, some of this code is liberally stolen from the asm parser.
6872 case 'M': {
6873 if (!isUInt<32>(CVal))
6874 return;
6875 if (AArch64_AM::isLogicalImmediate(CVal, 32))
6876 break;
6877 if ((CVal & 0xFFFF) == CVal)
6878 break;
6879 if ((CVal & 0xFFFF0000ULL) == CVal)
6880 break;
6881 uint64_t NCVal = ~(uint32_t)CVal;
6882 if ((NCVal & 0xFFFFULL) == NCVal)
6883 break;
6884 if ((NCVal & 0xFFFF0000ULL) == NCVal)
6885 break;
6886 return;
6887 }
6888 case 'N': {
6889 if (AArch64_AM::isLogicalImmediate(CVal, 64))
6890 break;
6891 if ((CVal & 0xFFFFULL) == CVal)
6892 break;
6893 if ((CVal & 0xFFFF0000ULL) == CVal)
6894 break;
6895 if ((CVal & 0xFFFF00000000ULL) == CVal)
6896 break;
6897 if ((CVal & 0xFFFF000000000000ULL) == CVal)
6898 break;
6899 uint64_t NCVal = ~CVal;
6900 if ((NCVal & 0xFFFFULL) == NCVal)
6901 break;
6902 if ((NCVal & 0xFFFF0000ULL) == NCVal)
6903 break;
6904 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
6905 break;
6906 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
6907 break;
6908 return;
6909 }
6910 default:
6911 return;
6912 }
6913
6914 // All assembler immediates are 64-bit integers.
6915 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
6916 break;
6917 }
6918
6919 if (Result.getNode()) {
6920 Ops.push_back(Result);
6921 return;
6922 }
6923
6924 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
6925}
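
// Hedged examples of immediates the cases above accept (again GNU inline asm,
// AArch64 target assumed; X is an illustrative unsigned variable):
//
//   asm("add %w0, %w0, %1" : "+r"(X) : "I"(4095));        // I: ADD imm 0..4095
//   asm("and %w0, %w0, %1" : "+r"(X) : "K"(0xaaaaaaaaU)); // K: valid bimm32
//   asm("mov %w0, %1"      : "=r"(X) : "M"(0x12340000));  // M: single MOVZ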
6926
6927//===----------------------------------------------------------------------===//
6928// AArch64 Advanced SIMD Support
6929//===----------------------------------------------------------------------===//
6930
6931/// WidenVector - Given a value in the V64 register class, produce the
6932/// equivalent value in the V128 register class.
6934 EVT VT = V64Reg.getValueType();
6935 unsigned NarrowSize = VT.getVectorNumElements();
6936 MVT EltTy = VT.getVectorElementType().getSimpleVT();
6937 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
6938 SDLoc DL(V64Reg);
6939
6941 V64Reg, DAG.getConstant(0, DL, MVT::i32));
6942}
6943
6944/// getExtFactor - Determine the adjustment factor for the position when
6945/// generating an "extract from vector registers" instruction.
6946static unsigned getExtFactor(SDValue &V) {
6947 EVT EltType = V.getValueType().getVectorElementType();
6948 return EltType.getSizeInBits() / 8;
6949}
6950
6951/// NarrowVector - Given a value in the V128 register class, produce the
6952/// equivalent value in the V64 register class.
6954 EVT VT = V128Reg.getValueType();
6955 unsigned WideSize = VT.getVectorNumElements();
6956 MVT EltTy = VT.getVectorElementType().getSimpleVT();
6957 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
6958 SDLoc DL(V128Reg);
6959
6960 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
6961}
6962
6963// Gather data to see if the operation can be modelled as a
6964// shuffle in combination with VEXTs.
6966 SelectionDAG &DAG) const {
6967 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
6968 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
6969 SDLoc dl(Op);
6970 EVT VT = Op.getValueType();
6971 unsigned NumElts = VT.getVectorNumElements();
6972
6973 struct ShuffleSourceInfo {
6974 SDValue Vec;
6975 unsigned MinElt;
6976 unsigned MaxElt;
6977
6978 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
6979 // be compatible with the shuffle we intend to construct. As a result
6980 // ShuffleVec will be some sliding window into the original Vec.
6981 SDValue ShuffleVec;
6982
6983 // Code should guarantee that element i in Vec starts at element
6984 // "WindowBase + i * WindowScale" in ShuffleVec.
6985 int WindowBase;
6986 int WindowScale;
6987
6988 ShuffleSourceInfo(SDValue Vec)
6989 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
6990 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
6991
6992 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
6993 };
6994
6995 // First gather all vectors used as an immediate source for this BUILD_VECTOR
6996 // node.
6997 SmallVector<ShuffleSourceInfo, 2> Sources;
6998 for (unsigned i = 0; i < NumElts; ++i) {
6999 SDValue V = Op.getOperand(i);
7000 if (V.isUndef())
7001 continue;
7002 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7003 !isa<ConstantSDNode>(V.getOperand(1))) {
7004 LLVM_DEBUG(
7005 dbgs() << "Reshuffle failed: "
7006 "a shuffle can only come from building a vector from "
7007 "various elements of other vectors, provided their "
7008 "indices are constant\n");
7009 return SDValue();
7010 }
7011
7012 // Add this element source to the list if it's not already there.
7013 SDValue SourceVec = V.getOperand(0);
7014 auto Source = find(Sources, SourceVec);
7015 if (Source == Sources.end())
7016 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7017
7018 // Update the minimum and maximum lane number seen.
7019 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
7020 Source->MinElt = std::min(Source->MinElt, EltNo);
7021 Source->MaxElt = std::max(Source->MaxElt, EltNo);
7022 }
7023
7024 if (Sources.size() > 2) {
7025 LLVM_DEBUG(
7026 dbgs() << "Reshuffle failed: currently only do something sane when at "
7027 "most two source vectors are involved\n");
7028 return SDValue();
7029 }
7030
7031 // Find out the smallest element size among the result and the two sources,
7032 // and use it as the element size to build the shuffle_vector.
7033 EVT SmallestEltTy = VT.getVectorElementType();
7034 for (auto &Source : Sources) {
7035 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
7036 if (SrcEltTy.bitsLT(SmallestEltTy)) {
7037 SmallestEltTy = SrcEltTy;
7038 }
7039 }
7040 unsigned ResMultiplier =
7041 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
7042 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
7043 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
7044
7045 // If the source vector is too wide or too narrow, we may nevertheless be able
7046 // to construct a compatible shuffle either by concatenating it with UNDEF or
7047 // extracting a suitable range of elements.
7048 for (auto &Src : Sources) {
7049 EVT SrcVT = Src.ShuffleVec.getValueType();
7050
7051 if (SrcVT.getSizeInBits() == VT.getSizeInBits())
7052 continue;
7053
7054 // This stage of the search produces a source with the same element type as
7055 // the original, but with a total width matching the BUILD_VECTOR output.
7056 EVT EltVT = SrcVT.getVectorElementType();
7057 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
7059
7060 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
7061 assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
7062 // We can pad out the smaller vector for free, so if it's part of a
7063 // shuffle...
7064 Src.ShuffleVec =
7065 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
7066 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
7067 continue;
7068 }
7069
7070 assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
7071
7072 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
7073 LLVM_DEBUG(
7074 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
7075 return SDValue();
7076 }
7077
7078 if (Src.MinElt >= NumSrcElts) {
7079 // The extraction can just take the second half
7080 Src.ShuffleVec =
7081 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7082 DAG.getConstant(NumSrcElts, dl, MVT::i64));
7083 Src.WindowBase = -NumSrcElts;
7084 } else if (Src.MaxElt < NumSrcElts) {
7085 // The extraction can just take the first half
7086 Src.ShuffleVec =
7087 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7088 DAG.getConstant(0, dl, MVT::i64));
7089 } else {
7090 // An actual VEXT is needed
7092 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7093 DAG.getConstant(0, dl, MVT::i64));
7095 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7096 DAG.getConstant(NumSrcElts, dl, MVT::i64));
7097 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
7098
7099 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
7100 VEXTSrc2,
7101 DAG.getConstant(Imm, dl, MVT::i32));
7102 Src.WindowBase = -Src.MinElt;
7103 }
7104 }
7105
7106 // Another possible incompatibility occurs from the vector element types. We
7107 // can fix this by bitcasting the source vectors to the same type we intend
7108 // for the shuffle.
7109 for (auto &Src : Sources) {
7110 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
7111 if (SrcEltTy == SmallestEltTy)
7112 continue;
7113 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
7114 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
7115 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
7116 Src.WindowBase *= Src.WindowScale;
7117 }
7118
7119 // Final sanity check before we try to actually produce a shuffle.
7120 LLVM_DEBUG(for (auto Src
7121 : Sources)
7122 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
7123
7124 // The stars all align, our next step is to produce the mask for the shuffle.
7125 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
7126 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
7127 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
7128 SDValue Entry = Op.getOperand(i);
7129 if (Entry.isUndef())
7130 continue;
7131
7132 auto Src = find(Sources, Entry.getOperand(0));
7133 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
7134
7135 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
7136 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
7137 // segment.
7138 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
7139 int BitsDefined =
7140 std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
7141 int LanesDefined = BitsDefined / BitsPerShuffleLane;
7142
7143 // This source is expected to fill ResMultiplier lanes of the final shuffle,
7144 // starting at the appropriate offset.
7145 int *LaneMask = &Mask[i * ResMultiplier];
7146
7147 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
7148 ExtractBase += NumElts * (Src - Sources.begin());
7149 for (int j = 0; j < LanesDefined; ++j)
7150 LaneMask[j] = ExtractBase + j;
7151 }
7152
7153 // Final check before we try to produce nonsense...
7154 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
7155 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
7156 return SDValue();
7157 }
7158
7159 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
7160 for (unsigned i = 0; i < Sources.size(); ++i)
7161 ShuffleOps[i] = Sources[i].ShuffleVec;
7162
7163 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
7164 ShuffleOps[1], Mask);
7165 SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
7166
7167 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
7168 dbgs() << "Reshuffle, creating node: "; V.dump(););
7169
7170 return V;
7171}
7172
7173 // Check if an EXT instruction can handle the shuffle mask when the
7174 // vector sources of the shuffle are the same.
7175static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7176 unsigned NumElts = VT.getVectorNumElements();
7177
7178 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7179 if (M[0] < 0)
7180 return false;
7181
7182 Imm = M[0];
7183
7184 // If this is a VEXT shuffle, the immediate value is the index of the first
7185 // element. The other shuffle indices must be the successive elements after
7186 // the first one.
7187 unsigned ExpectedElt = Imm;
7188 for (unsigned i = 1; i < NumElts; ++i) {
7189 // Increment the expected index. If it wraps around, just follow it
7190 // back to index zero and keep going.
7191 ++ExpectedElt;
7192 if (ExpectedElt == NumElts)
7193 ExpectedElt = 0;
7194
7195 if (M[i] < 0)
7196 continue; // ignore UNDEF indices
7197 if (ExpectedElt != static_cast<unsigned>(M[i]))
7198 return false;
7199 }
7200
7201 return true;
7202}
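
// What a single-source EXT computes, as a scalar model (illustrative sketch):
// the masks accepted above all describe Result[i] = V[(Imm + i) % NumElts],
// i.e. a rotation of the input vector's elements starting at Imm.
static void extSingleModel(const int *V, int *Result, unsigned NumElts,
                           unsigned Imm) {
  for (unsigned I = 0; I != NumElts; ++I)
    Result[I] = V[(Imm + I) % NumElts];
}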
7203
7204 // Check if an EXT instruction can handle the shuffle mask when the
7205 // vector sources of the shuffle are different.
7206static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
7207 unsigned &Imm) {
7208 // Look for the first non-undef element.
7209 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
7210
7211 // Benefit from APInt to handle overflow when calculating the expected element.
7212 unsigned NumElts = VT.getVectorNumElements();
7213 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
7215 // The following shuffle indices must be the successive elements after the
7216 // first real element.
7217 const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
7218 [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
7219 if (FirstWrongElt != M.end())
7220 return false;
7221
7222 // The index of an EXT is the first element if it is not UNDEF.
7223 // Watch out for the beginning UNDEFs. The EXT index should be the expected
7224 // value of the first element. E.g.
7225 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
7226 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
7227 // ExpectedElt is the last mask index plus 1.
7228 Imm = ExpectedElt.getZExtValue();
7229
7230 // There are two different cases that require reversing the input vectors.
7231 // For example, for vector <4 x i32> we have the following cases,
7232 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
7233 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
7234 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
7235 // to reverse two input vectors.
7236 if (Imm < NumElts)
7237 ReverseEXT = true;
7238 else
7239 Imm -= NumElts;
7240
7241 return true;
7242}
7243
7244/// isREVMask - Check if a vector shuffle corresponds to a REV
7245/// instruction with the specified blocksize. (The order of the elements
7246/// within each block of the vector is reversed.)
7247static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
7248 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
7249 "Only possible block sizes for REV are: 16, 32, 64");
7250
7251 unsigned EltSz = VT.getScalarSizeInBits();
7252 if (EltSz == 64)
7253 return false;
7254
7255 unsigned NumElts = VT.getVectorNumElements();
7256 unsigned BlockElts = M[0] + 1;
7257 // If the first shuffle index is UNDEF, be optimistic.
7258 if (M[0] < 0)
7259 BlockElts = BlockSize / EltSz;
7260
7261 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
7262 return false;
7263
7264 for (unsigned i = 0; i < NumElts; ++i) {
7265 if (M[i] < 0)
7266 continue; // ignore UNDEF indices
7267 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
7268 return false;
7269 }
7270
7271 return true;
7272}
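
// The per-element formula checked above, used here to generate the expected
// mask (illustrative sketch): for v8i8 with BlockSize 32, i.e. BlockElts == 4,
// this produces <3, 2, 1, 0, 7, 6, 5, 4>.
static void revMaskModel(int *M, unsigned NumElts, unsigned BlockElts) {
  for (unsigned I = 0; I != NumElts; ++I)
    M[I] = (I - I % BlockElts) + (BlockElts - 1 - I % BlockElts);
}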
7273
7274static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7275 unsigned NumElts = VT.getVectorNumElements();
7276 if (NumElts % 2 != 0)
7277 return false;
7278 WhichResult = (M[0] == 0 ? 0 : 1);
7279 unsigned Idx = WhichResult * NumElts / 2;
7280 for (unsigned i = 0; i != NumElts; i += 2) {
7281 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
7282 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
7283 return false;
7284 Idx += 1;
7285 }
7286
7287 return true;
7288}
7289
7290static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7291 unsigned NumElts = VT.getVectorNumElements();
7292 WhichResult = (M[0] == 0 ? 0 : 1);
7293 for (unsigned i = 0; i != NumElts; ++i) {
7294 if (M[i] < 0)
7295 continue; // ignore UNDEF indices
7296 if ((unsigned)M[i] != 2 * i + WhichResult)
7297 return false;
7298 }
7299
7300 return true;
7301}
7302
7303static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7304 unsigned NumElts = VT.getVectorNumElements();
7305 if (NumElts % 2 != 0)
7306 return false;
7307 WhichResult = (M[0] == 0 ? 0 : 1);
7308 for (unsigned i = 0; i < NumElts; i += 2) {
7309 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
7310 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
7311 return false;
7312 }
7313 return true;
7314}
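
// Worked examples of the masks the three predicates above accept for a v4i32
// shuffle with WhichResult == 0 (first result):
static constexpr int ZIP1MaskV4I32[4] = {0, 4, 1, 5}; // interleave low halves
static constexpr int UZP1MaskV4I32[4] = {0, 2, 4, 6}; // even elements
static constexpr int TRN1MaskV4I32[4] = {0, 4, 2, 6}; // even lanes transposed
// With WhichResult == 1 the masks start from the odd/upper halves instead:
// ZIP2 <2, 6, 3, 7>, UZP2 <1, 3, 5, 7>, TRN2 <1, 5, 3, 7>.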
7315
7316/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
7317/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7318/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7319static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7320 unsigned NumElts = VT.getVectorNumElements();
7321 if (NumElts % 2 != 0)
7322 return false;
7323 WhichResult = (M[0] == 0 ? 0 : 1);
7324 unsigned Idx = WhichResult * NumElts / 2;
7325 for (unsigned i = 0; i != NumElts; i += 2) {
7326 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
7327 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
7328 return false;
7329 Idx += 1;
7330 }
7331
7332 return true;
7333}
7334
7335/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
7336/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7337 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
7338static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7339 unsigned Half = VT.getVectorNumElements() / 2;
7340 WhichResult = (M[0] == 0 ? 0 : 1);
7341 for (unsigned j = 0; j != 2; ++j) {
7342 unsigned Idx = WhichResult;
7343 for (unsigned i = 0; i != Half; ++i) {
7344 int MIdx = M[i + j * Half];
7345 if (MIdx >= 0 && (unsigned)MIdx != Idx)
7346 return false;
7347 Idx += 2;
7348 }
7349 }
7350
7351 return true;
7352}
7353
7354/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
7355/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7356/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7357static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7358 unsigned NumElts = VT.getVectorNumElements();
7359 if (NumElts % 2 != 0)
7360 return false;
7361 WhichResult = (M[0] == 0 ? 0 : 1);
7362 for (unsigned i = 0; i < NumElts; i += 2) {
7363 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
7364 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
7365 return false;
7366 }
7367 return true;
7368}
7369
7371 bool &DstIsLeft, int &Anomaly) {
7372 if (M.size() != static_cast<size_t>(NumInputElements))
7373 return false;
7374
7375 int NumLHSMatch = 0, NumRHSMatch = 0;
7376 int LastLHSMismatch = -1, LastRHSMismatch = -1;
7377
7378 for (int i = 0; i < NumInputElements; ++i) {
7379 if (M[i] == -1) {
7380 ++NumLHSMatch;
7381 ++NumRHSMatch;
7382 continue;
7383 }
7384
7385 if (M[i] == i)
7386 ++NumLHSMatch;
7387 else
7388 LastLHSMismatch = i;
7389
7390 if (M[i] == i + NumInputElements)
7391 ++NumRHSMatch;
7392 else
7393 LastRHSMismatch = i;
7394 }
7395
7396 if (NumLHSMatch == NumInputElements - 1) {
7397 DstIsLeft = true;
7398 Anomaly = LastLHSMismatch;
7399 return true;
7400 } else if (NumRHSMatch == NumInputElements - 1) {
7401 DstIsLeft = false;
7402 Anomaly = LastRHSMismatch;
7403 return true;
7404 }
7405
7406 return false;
7407}
7408
7409static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
7410 if (VT.getSizeInBits() != 128)
7411 return false;
7412
7413 unsigned NumElts = VT.getVectorNumElements();
7414
7415 for (int I = 0, E = NumElts / 2; I != E; I++) {
7416 if (Mask[I] != I)
7417 return false;
7418 }
7419
7420 int Offset = NumElts / 2;
7421 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
7422 if (Mask[I] != I + SplitLHS * Offset)
7423 return false;
7424 }
7425
7426 return true;
7427}
7428
7429 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
7430 SDLoc DL(Op);
7431 EVT VT = Op.getValueType();
7432 SDValue V0 = Op.getOperand(0);
7433 SDValue V1 = Op.getOperand(1);
7434 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
7435
7436 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
7437 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
7438 return SDValue();
7439
7440 bool SplitV0 = V0.getValueSizeInBits() == 128;
7441
7442 if (!isConcatMask(Mask, VT, SplitV0))
7443 return SDValue();
7444
7446 if (SplitV0) {
7448 DAG.getConstant(0, DL, MVT::i64));
7449 }
7450 if (V1.getValueSizeInBits() == 128) {
7452 DAG.getConstant(0, DL, MVT::i64));
7453 }
7454 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
7455}
7456
7457/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
7458/// the specified operations to build the shuffle.
7460 SDValue RHS, SelectionDAG &DAG,
7461 const SDLoc &dl) {
7462 unsigned OpNum = (PFEntry >> 26) & 0x0F;
7463 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
7464 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
7465
7466 enum {
7467 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
7468 OP_VREV,
7469 OP_VDUP0,
7470 OP_VDUP1,
7471 OP_VDUP2,
7472 OP_VDUP3,
7473 OP_VEXT1,
7474 OP_VEXT2,
7475 OP_VEXT3,
7476 OP_VUZPL, // VUZP, left result
7477 OP_VUZPR, // VUZP, right result
7478 OP_VZIPL, // VZIP, left result
7479 OP_VZIPR, // VZIP, right result
7480 OP_VTRNL, // VTRN, left result
7481 OP_VTRNR // VTRN, right result
7482 };
7483
7484 if (OpNum == OP_COPY) {
7485 if (LHSID == (1 * 9 + 2) * 9 + 3)
7486 return LHS;
7487 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
7488 return RHS;
7489 }
7490
7494 EVT VT = OpLHS.getValueType();
7495
7496 switch (OpNum) {
7497 default:
7498 llvm_unreachable("Unknown shuffle opcode!");
7499 case OP_VREV:
7500 // VREV divides the vector in half and swaps within the half.
7501 if (VT.getVectorElementType() == MVT::i32 ||
7503 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
7504 // vrev <4 x i16> -> REV32
7505 if (VT.getVectorElementType() == MVT::i16 ||
7508 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
7509 // vrev <4 x i8> -> REV16
7511 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
7512 case OP_VDUP0:
7513 case OP_VDUP1:
7514 case OP_VDUP2:
7515 case OP_VDUP3: {
7516 EVT EltTy = VT.getVectorElementType();
7517 unsigned Opcode;
7518 if (EltTy == MVT::i8)
7519 Opcode = AArch64ISD::DUPLANE8;
7520 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
7521 Opcode = AArch64ISD::DUPLANE16;
7522 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
7523 Opcode = AArch64ISD::DUPLANE32;
7524 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
7525 Opcode = AArch64ISD::DUPLANE64;
7526 else
7527 llvm_unreachable("Invalid vector element type?");
7528
7529 if (VT.getSizeInBits() == 64)
7530 OpLHS = WidenVector(OpLHS, DAG);
7531 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
7532 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
7533 }
7534 case OP_VEXT1:
7535 case OP_VEXT2:
7536 case OP_VEXT3: {
7537 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
7538 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
7539 DAG.getConstant(Imm, dl, MVT::i32));
7540 }
7541 case OP_VUZPL:
7542 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
7543 OpRHS);
7544 case OP_VUZPR:
7545 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
7546 OpRHS);
7547 case OP_VZIPL:
7548 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
7549 OpRHS);
7550 case OP_VZIPR:
7551 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
7552 OpRHS);
7553 case OP_VTRNL:
7554 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
7555 OpRHS);
7556 case OP_VTRNR:
7557 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
7558 OpRHS);
7559 }
7560}
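
// The perfect-shuffle table encoding decoded above, as a standalone sketch
// (field layout read off the shifts in this function): each 13-bit operand id
// encodes four lanes as base-9 digits, where digits 0-3 pick LHS lanes, 4-7
// pick RHS lanes, and 8 means undef.
struct PFEntryModel {
  unsigned OpNum; // bits [29:26], one of the OP_* values
  unsigned LHSID; // bits [25:13]
  unsigned RHSID; // bits [12:0]
};
static PFEntryModel decodePFEntryModel(unsigned PFEntry) {
  return {(PFEntry >> 26) & 0x0F, (PFEntry >> 13) & ((1u << 13) - 1),
          PFEntry & ((1u << 13) - 1)};
}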
7561
7563 SelectionDAG &DAG) {
7564 // Check to see if we can use the TBL instruction.
7565 SDValue V1 = Op.getOperand(0);
7566 SDValue V2 = Op.getOperand(1);
7567 SDLoc DL(Op);
7568
7569 EVT EltVT = Op.getValueType().getVectorElementType();
7570 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
7571
7573 for (int Val : ShuffleMask) {
7574 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
7575 unsigned Offset = Byte + Val * BytesPerElt;
7576 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
7577 }
7578 }
7579
7581 unsigned IndexLen = 8;
7582 if (Op.getValueSizeInBits() == 128) {
7584 IndexLen = 16;
7585 }
7586
7589
7590 SDValue Shuffle;
7591 if (V2.getNode()->isUndef()) {
7592 if (IndexLen == 8)
7594 Shuffle = DAG.getNode(
7596 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
7598 makeArrayRef(TBLMask.data(), IndexLen)));
7599 } else {
7600 if (IndexLen == 8) {
7602 Shuffle = DAG.getNode(
7604 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
7606 makeArrayRef(TBLMask.data(), IndexLen)));
7607 } else {
7608 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
7609 // cannot currently represent the register constraints on the input
7610 // table registers.
7611 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
7612 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
7613 // IndexLen));
7614 Shuffle = DAG.getNode(
7616 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
7618 makeArrayRef(TBLMask.data(), IndexLen)));
7619 }
7620 }
7621 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
7622}
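
// A byte-table model of the TBL lowering above (illustrative sketch): each
// result byte selects Table[Index] when the index is in range and 0 otherwise,
// which is why mask entries that point into an undef V2 can safely run off
// the end of a single-register table.
static void tbl1Model(const uint8_t *Table, unsigned TableLen,
                      const uint8_t *Indices, uint8_t *Out, unsigned OutLen) {
  for (unsigned I = 0; I != OutLen; ++I)
    Out[I] = Indices[I] < TableLen ? Table[Indices[I]] : 0;
}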
7623
7624static unsigned getDUPLANEOp(EVT EltType) {
7625 if (EltType == MVT::i8)
7626 return AArch64ISD::DUPLANE8;
7627 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
7628 return AArch64ISD::DUPLANE16;
7629 if (EltType == MVT::i32 || EltType == MVT::f32)
7630 return AArch64ISD::DUPLANE32;
7631 if (EltType == MVT::i64 || EltType == MVT::f64)
7632 return AArch64ISD::DUPLANE64;
7633
7634 llvm_unreachable("Invalid vector element type?");
7635}
7636
7637SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
7638 SelectionDAG &DAG) const {
7639 SDLoc dl(Op);
7640 EVT VT = Op.getValueType();
7641
7642 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
7643
7644 // Convert shuffles that are directly supported on NEON to target-specific
7645 // DAG nodes, instead of keeping them as shuffles and matching them again
7646 // during code selection. This is more efficient and avoids the possibility
7647 // of inconsistencies between legalization and selection.
7648 ArrayRef<int> ShuffleMask = SVN->getMask();
7649
7650 SDValue V1 = Op.getOperand(0);
7651 SDValue V2 = Op.getOperand(1);
7652
7653 if (SVN->isSplat()) {
7654 int Lane = SVN->getSplatIndex();
7655 // If this is an undef splat, generate it via "just" vdup, if possible.
7656 if (Lane == -1)
7657 Lane = 0;
7658
7659 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
7660 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
7661 V1.getOperand(0));
7662 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
7663 // constant. If so, we can just reference the lane's definition directly.
7664 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
7665 !isa<ConstantSDNode>(V1.getOperand(Lane)))
7666 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
7667
7668 // Otherwise, duplicate from the lane of the input vector.
7669 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
7670
7671 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
7672 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
7673 // Match: dup (bitcast (extract_subv X, C)), LaneC
7674 if (BitCast.getOpcode() != ISD::BITCAST ||
7675 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
7676 return false;
7677
7678 // The extract index must align in the destination type. That may not
7679 // happen if the bitcast is from narrow to wide type.
7680 SDValue Extract = BitCast.getOperand(0);
7681 unsigned ExtIdx = Extract.getConstantOperandVal(1);
7682 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
7683 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
7684 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
7686 return false;
7687
7688 // Update the lane value by offsetting with the scaled extract index.
7690
7691 // Determine the casted vector type of the wide vector input.
7692 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
7693 // Examples:
7694 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
7695 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
7696 unsigned SrcVecNumElts =
7698 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
7700 return true;
7701 };
7702 MVT CastVT;
7703 if (getScaledOffsetDup(V1, Lane, CastVT)) {
7704 V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0));
7705 } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7706 // The lane is incremented by the index of the extract.
7707 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
7708 Lane += V1.getConstantOperandVal(1);
7709 V1 = V1.getOperand(0);
7710 } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
7711 // The lane is decremented if we are splatting from the 2nd operand.
7712 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
7713 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
7714 Lane -= Idx * VT.getVectorNumElements() / 2;
7715 V1 = WidenVector(V1.getOperand(Idx), DAG);
7716 } else if (VT.getSizeInBits() == 64) {
7717 // Widen the operand to 128-bit register with undef.
7718 V1 = WidenVector(V1, DAG);
7719 }
7720 return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
7721 }
7722
7723 if (isREVMask(ShuffleMask, VT, 64))
7724 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
7725 if (isREVMask(ShuffleMask, VT, 32))
7726 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
7727 if (isREVMask(ShuffleMask, VT, 16))
7728 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
7729
7730 bool ReverseEXT = false;
7731 unsigned Imm;
7732 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
7733 if (ReverseEXT)
7734 std::swap(V1, V2);
7735 Imm *= getExtFactor(V1);
7736 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
7737 DAG.getConstant(Imm, dl, MVT::i32));
7738 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
7739 Imm *= getExtFactor(V1);
7740 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
7741 DAG.getConstant(Imm, dl, MVT::i32));
7742 }
7743
7744 unsigned WhichResult;
7745 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
7746 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
7747 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
7748 }
7749 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
7750 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
7751 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
7752 }
7753 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
7754 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
7755 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
7756 }
7757
7758 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
7759 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
7760 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
7761 }
7762 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
7763 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
7764 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
7765 }
7766 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
7767 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
7768 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
7769 }
7770
7771 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
7772 return Concat;
7773
7774 bool DstIsLeft;
7775 int Anomaly;
7776 int NumInputElements = V1.getValueType().getVectorNumElements();
7777 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
7778 SDValue DstVec = DstIsLeft ? V1 : V2;
7779 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
7780
7781 SDValue SrcVec = V1;
7782 int SrcLane = ShuffleMask[Anomaly];
7783 if (SrcLane >= NumInputElements) {
7784 SrcVec = V2;
7785 SrcLane -= VT.getVectorNumElements();
7786 }
7787 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
7788
7789 EVT ScalarVT = VT.getVectorElementType();
7790
7791 if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
7792 ScalarVT = MVT::i32;
7793
7794 return DAG.getNode(
7795 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7796 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
7797 DstLaneV);
7798 }
7799
7800 // If the shuffle is not directly supported and it has 4 elements, use
7801 // the PerfectShuffle-generated table to synthesize it from other shuffles.
7802 unsigned NumElts = VT.getVectorNumElements();
7803 if (NumElts == 4) {
7804 unsigned PFIndexes[4];
7805 for (unsigned i = 0; i != 4; ++i) {
7806 if (ShuffleMask[i] < 0)
7807 PFIndexes[i] = 8;
7808 else
7809 PFIndexes[i] = ShuffleMask[i];
7810 }
7811
7812 // Compute the index in the perfect shuffle table.
7813 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
7814 PFIndexes[2] * 9 + PFIndexes[3];
7815 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
7816 unsigned Cost = (PFEntry >> 30);
7817
7818 if (Cost <= 4)
7819 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
7820 }
7821
7822 return GenerateTBL(Op, ShuffleMask, DAG);
7823}
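// Editorial examples of the mask shapes matched above, shown for v4i32
// operands a[0..3] and b[4..7] (not part of the original file):
//   <0,4,1,5> --> ZIP1 (a0,b0,a1,b1)     <2,6,3,7> --> ZIP2 (a2,b2,a3,b3)
//   <0,2,4,6> --> UZP1 (even elements)   <1,3,5,7> --> UZP2 (odd elements)
//   <0,4,2,6> --> TRN1                   <1,5,3,7> --> TRN2
// Masks that fail every matcher fall through to the perfect-shuffle table
// (4-element vectors only) or, failing that, to a TBL lookup.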
7824
7825SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
7826 SelectionDAG &DAG) const {
7827 SDLoc dl(Op);
7828 EVT VT = Op.getValueType();
7829 EVT ElemVT = VT.getScalarType();
7830
7831 SDValue SplatVal = Op.getOperand(0);
7832
7833 // Extend input splat value where needed to fit into a GPR (32b or 64b only)
7834 // FPRs don't have this restriction.
7835 switch (ElemVT.getSimpleVT().SimpleTy) {
7836 case MVT::i1: {
7837 // The only legal i1 vectors are SVE vectors, so we can use SVE-specific
7838 // lowering code.
7839 if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
7840 if (ConstVal->isOne())
7841 return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
7842 // TODO: Add special case for constant false
7843 }
7844 // The general case of i1. There isn't any natural way to do this,
7845 // so we use some trickery with whilelo.
7846 SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
7847 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
7848 DAG.getValueType(MVT::i1));
7849 SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl,
7850 MVT::i64);
7851 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
7852 DAG.getConstant(0, dl, MVT::i64), SplatVal);
7853 }
7854 case MVT::i8:
7855 case MVT::i16:
7856 case MVT::i32:
7857 SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
7858 break;
7859 case MVT::i64:
7860 SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
7861 break;
7862 case MVT::f16:
7863 case MVT::bf16:
7864 case MVT::f32:
7865 case MVT::f64:
7866 // Fine as is
7867 break;
7868 default:
7869 report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
7870 }
7871
7872 return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
7873}
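// Editorial note (not in the original file): for a variable i1 splat the
// whilelo trick sign-extends the condition to 0 or -1 in a GPR, so
//   whilelo p0.b, xzr, x0
// yields the empty predicate when the value is false and the all-active
// predicate when it is true, which is exactly splat_vector's semantics.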
7874
7875SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
7876 SelectionDAG &DAG) const {
7877 SDLoc DL(Op);
7878
7879 EVT VT = Op.getValueType();
7880 if (!isTypeLegal(VT) || !VT.isScalableVector())
7881 return SDValue();
7882
7883 // Current lowering only supports the SVE-ACLE types.
7884 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
7885 return SDValue();
7886
7887 // The DUPQ operation is independent of element type, so normalise to i64s.
7888 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
7889 SDValue Idx128 = Op.getOperand(2);
7890
7891 // DUPQ can be used when idx is in range.
7892 ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx128);
7893 if (CIdx && (CIdx->getZExtValue() <= 3)) {
7894 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
7895 SDNode *DUPQ =
7896 DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
7897 return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
7898 }
7899
7900 // The ACLE says this must produce the same result as:
7901 // svtbl(data, svadd_x(svptrue_b64(),
7902 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
7903 // index * 2))
7904 SDValue One = DAG.getConstant(1, DL, MVT::i64);
7905 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
7906
7907 // create the vector 0,1,0,1,...
7908 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7909 SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
7910 DL, MVT::nxv2i64, Zero, One);
7911 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
7912
7913 // create the vector idx64,idx64+1,idx64,idx64+1,...
7914 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
7915 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
7916 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
7917
7918 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
7919 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
7920 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
7921}
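// Editorial example (not in the original file): a constant index such as
// svdupq_lane(zn, 1) takes the DUP_ZZI_Q path above and selects to
//   dup z0.q, z1.q[1]
// while a variable index builds the 0,1,0,1,... vector, adds index*2 to
// every element, and permutes whole 128-bit quadwords with the TBL.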
7922
7923
7924static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
7925 APInt &UndefBits) {
7926 EVT VT = BVN->getValueType(0);
7927 APInt SplatBits, SplatUndef;
7928 unsigned SplatBitSize;
7929 bool HasAnyUndefs;
7930 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7931 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
7932
7933 for (unsigned i = 0; i < NumSplats; ++i) {
7934 CnstBits <<= SplatBitSize;
7935 UndefBits <<= SplatBitSize;
7936 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
7937 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
7938 }
7939
7940 return true;
7941 }
7942
7943 return false;
7944}
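// Editorial worked example (not in the original file): a v2i32
// build_vector splatting 0x0000FF00 reports SplatBitSize == 32, so the
// loop above executes twice and leaves CnstBits == 0x0000FF000000FF00,
// i.e. the splat value replicated across the vector's full 64-bit width.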
7945
7946// Try 64-bit splatted SIMD immediate.
7947static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
7948 const APInt &Bits) {
7949 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
7950 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
7951 EVT VT = Op.getValueType();
7952 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
7953
7954 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
7955 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
7956
7957 SDLoc dl(Op);
7958 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
7959 DAG.getConstant(Value, dl, MVT::i32));
7960 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
7961 }
7962 }
7963
7964 return SDValue();
7965}
7966
7967// Try 32-bit splatted SIMD immediate.
7968static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
7969 const APInt &Bits,
7970 const SDValue *LHS = nullptr) {
7971 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
7972 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
7973 EVT VT = Op.getValueType();
7974 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
7975 bool isAdvSIMDModImm = false;
7976 uint64_t Shift;
7977
7978 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
7979 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
7980 Shift = 0;
7981 }
7982 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
7983 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
7984 Shift = 8;
7985 }
7986 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
7987 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
7988 Shift = 16;
7989 }
7990 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
7991 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
7992 Shift = 24;
7993 }
7994
7995 if (isAdvSIMDModImm) {
7996 SDLoc dl(Op);
7997 SDValue Mov;
7998
7999 if (LHS)
8000 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
8001 DAG.getConstant(Value, dl, MVT::i32),
8002 DAG.getConstant(Shift, dl, MVT::i32));
8003 else
8004 Mov = DAG.getNode(NewOp, dl, MovTy,
8005 DAG.getConstant(Value, dl, MVT::i32),
8006 DAG.getConstant(Shift, dl, MVT::i32));
8007
8008 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
8009 }
8010 }
8011
8012 return SDValue();
8013}
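// Editorial worked example (not in the original file): splatting
// 0x00FF0000 across v4i32 matches isAdvSIMDModImmType3 (0x00nn0000), so
// Value becomes 0xFF and Shift becomes 16, and the resulting node selects
// to "movi v0.4s, #0xff, lsl #16" - one instruction, no constant pool.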
8014
8015// Try 16-bit splatted SIMD immediate.
8016static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
8017 const APInt &Bits,
8018 const SDValue *LHS = nullptr) {
8019 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
8020 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
8021 EVT VT = Op.getValueType();
8022 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
8023 bool isAdvSIMDModImm = false;
8024 uint64_t Shift;
8025
8026 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
8027 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
8028 Shift = 0;
8029 }
8030 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
8031 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
8032 Shift = 8;
8033 }
8034
8035 if (isAdvSIMDModImm) {
8036 SDLoc dl(Op);
8037 SDValue Mov;
8038
8039 if (LHS)
8040 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
8041 DAG.getConstant(Value, dl, MVT::i32),
8042 DAG.getConstant(Shift, dl, MVT::i32));
8043 else
8044 Mov = DAG.getNode(NewOp, dl, MovTy,
8045 DAG.getConstant(Value, dl, MVT::i32),
8046 DAG.getConstant(Shift, dl, MVT::i32));
8047
8048 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
8049 }
8050 }
8051
8052 return SDValue();
8053}
8054
8055// Try 32-bit splatted SIMD immediate with shifted ones.
8056static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
8057 SelectionDAG &DAG, const APInt &Bits) {
8058 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
8059 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
8060 EVT VT = Op.getValueType();
8061 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
8062 bool isAdvSIMDModImm = false;
8063 uint64_t Shift;
8064
8065 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
8066 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
8067 Shift = 264;
8068 }
8069 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
8070 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
8071 Shift = 272;
8072 }
8073
8074 if (isAdvSIMDModImm) {
8075 SDLoc dl(Op);
8076 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
8077 DAG.getConstant(Value, dl, MVT::i32),
8078 DAG.getConstant(Shift, dl, MVT::i32));
8079 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
8080 }
8081 }
8082
8083 return SDValue();
8084}
8085
8086// Try 8-bit splatted SIMD immediate.
8087static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
8088 const APInt &Bits) {
8089 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
8090 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
8091 EVT VT = Op.getValueType();
8092 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
8093
8094 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
8095 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
8096
8097 SDLoc dl(Op);
8098 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
8099 DAG.getConstant(Value, dl, MVT::i32));
8100 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
8101 }
8102 }
8103
8104 return SDValue();
8105}
8106
8107// Try FP splatted SIMD immediate.
8108static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
8109 const APInt &Bits) {
8110 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
8111 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
8112 EVT VT = Op.getValueType();
8113 bool isWide = (VT.getSizeInBits() == 128);
8114 MVT MovTy;
8115 bool isAdvSIMDModImm = false;
8116
8117 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
8118 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
8119 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
8120 }
8121 else if (isWide &&
8122 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
8123 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
8124 MovTy = MVT::v2f64;
8125 }
8126
8127 if (isAdvSIMDModImm) {
8128 SDLoc dl(Op);
8129 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
8130 DAG.getConstant(Value, dl, MVT::i32));
8131 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
8132 }
8133 }
8134
8135 return SDValue();
8136}
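// Editorial worked example (not in the original file): splatting the
// double 1.0 (bit pattern 0x3FF0000000000000) across v2f64 matches
// isAdvSIMDModImmType12, so this emits "fmov v0.2d, #1.0" rather than
// loading the constant from memory.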
8137
8138// Specialized code to quickly find if PotentialBVec is a BuildVector that
8139// consists of only the same constant int value, returned in reference arg
8140// ConstVal
8141static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
8142 uint64_t &ConstVal) {
8143 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
8144 if (!Bvec)
8145 return false;
8146 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
8147 if (!FirstElt)
8148 return false;
8149 EVT VT = Bvec->getValueType(0);
8150 unsigned NumElts = VT.getVectorNumElements();
8151 for (unsigned i = 1; i < NumElts; ++i)
8152 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
8153 return false;
8154 ConstVal = FirstElt->getZExtValue();
8155 return true;
8156}
8157
8158static unsigned getIntrinsicID(const SDNode *N) {
8159 unsigned Opcode = N->getOpcode();
8160 switch (Opcode) {
8161 default:
8162 return Intrinsic::not_intrinsic;
8163 case ISD::INTRINSIC_WO_CHAIN: {
8164 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
8165 if (IID < Intrinsic::num_intrinsics)
8166 return IID;
8167 return Intrinsic::not_intrinsic;
8168 }
8169 }
8170}
8171
8172// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
8173// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
8174// BUILD_VECTORs with constant element C1, C2 is a constant, and:
8175// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
8176// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
8177// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
8179 EVT VT = N->getValueType(0);
8180
8181 if (!VT.isVector())
8182 return SDValue();
8183
8184 SDLoc DL(N);
8185
8186 SDValue And;
8187 SDValue Shift;
8188
8189 SDValue FirstOp = N->getOperand(0);
8190 unsigned FirstOpc = FirstOp.getOpcode();
8191 SDValue SecondOp = N->getOperand(1);
8192 unsigned SecondOpc = SecondOp.getOpcode();
8193
8194 // Is one of the operands an AND or a BICi? The AND may have been optimised to
8195 // a BICi in order to use an immediate instead of a register.
8196 // Is the other operand a shl or lshr? This will have been turned into:
8197 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
8198 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
8199 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
8200 And = FirstOp;
8201 Shift = SecondOp;
8202
8203 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
8204 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
8205 And = SecondOp;
8206 Shift = FirstOp;
8207 } else
8208 return SDValue();
8209
8210 bool IsAnd = And.getOpcode() == ISD::AND;
8211 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
8212
8213 // Is the shift amount constant?
8214 ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
8215 if (!C2node)
8216 return SDValue();
8217
8218 uint64_t C1;
8219 if (IsAnd) {
8220 // Is the and mask vector all constant?
8221 if (!isAllConstantBuildVector(And.getOperand(1), C1))
8222 return SDValue();
8223 } else {
8224 // Reconstruct the corresponding AND immediate from the two BICi immediates.
8225 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
8226 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
8227 assert(C1nodeImm && C1nodeShift);
8228 C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
8229 }
8230
8231 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
8232 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
8233 // how much one can shift elements of a particular size?
8234 uint64_t C2 = C2node->getZExtValue();
8235 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
8236 if (C2 > ElemSizeInBits)
8237 return SDValue();
8238
8239 APInt C1AsAPInt(ElemSizeInBits, C1);
8240 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
8241 : APInt::getLowBitsSet(ElemSizeInBits, C2);
8242 if (C1AsAPInt != RequiredC1)
8243 return SDValue();
8244
8245 SDValue X = And.getOperand(0);
8246 SDValue Y = Shift.getOperand(0);
8247
8248 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
8249 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
8250
8251 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
8252 LLVM_DEBUG(N->dump(&DAG));
8253 LLVM_DEBUG(dbgs() << "into: \n");
8254 LLVM_DEBUG(ResultSLI->dump(&DAG));
8255
8257 return ResultSLI;
8258}
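// Editorial worked example (not in the original file): with v4i32 X, Y in
//   (or (and X, splat 0x00FFFFFF), (AArch64ISD::VSHL Y, 24))
// we get C2 == 24 and C1 == 0x00FFFFFF == ~(Ones(32) << 24), so the OR is
// rewritten to (AArch64ISD::VSLI X, Y, 24), i.e. "sli v0.4s, v1.4s, #24",
// which keeps the low 24 bits of X and inserts Y shifted left by 24.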
8259
8260SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
8261 SelectionDAG &DAG) const {
8262 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
8263 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
8264 return Res;
8265
8266 EVT VT = Op.getValueType();
8267
8268 SDValue LHS = Op.getOperand(0);
8269 BuildVectorSDNode *BVN =
8270 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
8271 if (!BVN) {
8272 // OR commutes, so try swapping the operands.
8273 LHS = Op.getOperand(1);
8274 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
8275 }
8276 if (!BVN)
8277 return Op;
8278
8279 APInt DefBits(VT.getSizeInBits(), 0);
8280 APInt UndefBits(VT.getSizeInBits(), 0);
8281 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
8282 SDValue NewOp;
8283
8284 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
8285 DefBits, &LHS)) ||
8286 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
8287 DefBits, &LHS)))
8288 return NewOp;
8289
8290 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
8291 UndefBits, &LHS)) ||
8292 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
8293 UndefBits, &LHS)))
8294 return NewOp;
8295 }
8296
8297 // We can always fall back to a non-immediate OR.
8298 return Op;
8299}
8300
8301// Normalize the operands of BUILD_VECTOR. The value of constant operands will
8302// be truncated to fit element width.
8304 SelectionDAG &DAG) {
8305 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8306 SDLoc dl(Op);
8307 EVT VT = Op.getValueType();
8308 EVT EltTy = VT.getVectorElementType();
8309
8310 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
8311 return Op;
8312
8313 SmallVector<SDValue, 16> Ops;
8314 for (SDValue Lane : Op->ops()) {
8315 // For integer vectors, type legalization would have promoted the
8316 // operands already. Otherwise, if Op is a floating-point splat
8317 // (with operands cast to integers), then the only possibilities
8318 // are constants and UNDEFs.
8319 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
8320 APInt LowBits(EltTy.getSizeInBits(),
8321 CstLane->getZExtValue());
8322 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
8323 } else if (Lane.getNode()->isUndef()) {
8324 Lane = DAG.getUNDEF(MVT::i32);
8325 } else {
8326 assert(Lane.getValueType() == MVT::i32 &&
8327 "Unexpected BUILD_VECTOR operand type");
8328 }
8329 Ops.push_back(Lane);
8330 }
8331 return DAG.getBuildVector(VT, dl, Ops);
8332}
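// Editorial worked example (not in the original file): in a v4i16
// build_vector the operands were promoted to i32, so a constant lane
// holding 0x12345 is re-emitted as 0x2345 (truncated to the 16-bit
// element width), while undef and non-constant i32 lanes pass through.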
8333
8334static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
8335 EVT VT = Op.getValueType();
8336
8337 APInt DefBits(VT.getSizeInBits(), 0);
8338 APInt UndefBits(VT.getSizeInBits(), 0);
8339 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
8340 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
8341 SDValue NewOp;
8342 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
8343 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
8344 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
8345 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
8346 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
8347 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
8348 return NewOp;
8349
8350 DefBits = ~DefBits;
8351 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
8352 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
8353 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
8354 return NewOp;
8355
8356 DefBits = UndefBits;
8357 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
8358 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
8359 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
8360 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
8361 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
8362 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
8363 return NewOp;
8364
8365 DefBits = ~UndefBits;
8366 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
8367 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
8368 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
8369 return NewOp;
8370 }
8371
8372 return SDValue();
8373}
8374
8375SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
8376 SelectionDAG &DAG) const {
8377 EVT VT = Op.getValueType();
8378
8379 // Try to build a simple constant vector.
8380 Op = NormalizeBuildVector(Op, DAG);
8381 if (VT.isInteger()) {
8382 // Certain vector constants, used to express things like logical NOT and
8383 // arithmetic NEG, are passed through unmodified. This allows special
8384 // patterns for these operations to match, which will lower these constants
8385 // to whatever is proven necessary.
8386 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
8387 if (BVN->isConstant())
8388 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
8389 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
8390 APInt Val(BitSize,
8391 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
8392 if (Val.isNullValue() || Val.isAllOnesValue())
8393 return Op;
8394 }
8395 }
8396
8397 if (SDValue V = ConstantBuildVector(Op, DAG))
8398 return V;
8399
8400 // Scan through the operands to find some interesting properties we can
8401 // exploit:
8402 // 1) If only one value is used, we can use a DUP, or
8403 // 2) if only the low element is not undef, we can just insert that, or
8404 // 3) if only one constant value is used (w/ some non-constant lanes),
8405 // we can splat the constant value into the whole vector then fill
8406 // in the non-constant lanes.
8407 // 4) FIXME: If different constant values are used, but we can intelligently
8408 // select the values we'll be overwriting for the non-constant
8409 // lanes such that we can directly materialize the vector
8410 // some other way (MOVI, e.g.), we can be sneaky.
8411 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
8412 SDLoc dl(Op);
8413 unsigned NumElts = VT.getVectorNumElements();
8414 bool isOnlyLowElement = true;
8415 bool usesOnlyOneValue = true;
8416 bool usesOnlyOneConstantValue = true;
8417 bool isConstant = true;
8418 bool AllLanesExtractElt = true;
8419 unsigned NumConstantLanes = 0;
8420 SDValue Value;
8421 SDValue ConstantValue;
8422 for (unsigned i = 0; i < NumElts; ++i) {
8423 SDValue V = Op.getOperand(i);
8424 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8425 AllLanesExtractElt = false;
8426 if (V.isUndef())
8427 continue;
8428 if (i > 0)
8429 isOnlyLowElement = false;
8430 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
8431 isConstant = false;
8432
8433 if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
8434 ++NumConstantLanes;
8435 if (!ConstantValue.getNode())
8436 ConstantValue = V;
8437 else if (ConstantValue != V)
8438 usesOnlyOneConstantValue = false;
8439 }
8440
8441 if (!Value.getNode())
8442 Value = V;
8443 else if (V != Value)
8444 usesOnlyOneValue = false;
8445 }
8446
8447 if (!Value.getNode()) {
8448 LLVM_DEBUG(
8449 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
8450 return DAG.getUNDEF(VT);
8451 }
8452
8453 // Convert BUILD_VECTOR where all elements but the lowest are undef into
8454 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
8455 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
8456 if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) {
8457 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
8458 "SCALAR_TO_VECTOR node\n");
8459 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
8460 }
8461
8462 if (AllLanesExtractElt) {
8463 SDNode *Vector = nullptr;
8464 bool Even = false;
8465 bool Odd = false;
8466 // Check whether the extract elements match the Even pattern <0,2,4,...> or
8467 // the Odd pattern <1,3,5,...>.
8468 for (unsigned i = 0; i < NumElts; ++i) {
8469 SDValue V = Op.getOperand(i);
8470 const SDNode *N = V.getNode();
8471 if (!isa<ConstantSDNode>(N->getOperand(1)))
8472 break;
8473 SDValue N0 = N->getOperand(0);
8474
8475 // All elements are extracted from the same vector.
8476 if (!Vector) {
8477 Vector = N0.getNode();
8478 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
8479 // BUILD_VECTOR.
8480 if (VT.getVectorElementType() !=
8481 N0.getValueType().getVectorElementType())
8482 break;
8483 } else if (Vector != N0.getNode()) {
8484 Odd = false;
8485 Even = false;
8486 break;
8487 }
8488
8489 // Extracted values are either at Even indices <0,2,4,...> or at Odd
8490 // indices <1,3,5,...>.
8491 uint64_t Val = N->getConstantOperandVal(1);
8492 if (Val == 2 * i) {
8493 Even = true;
8494 continue;
8495 }
8496 if (Val - 1 == 2 * i) {
8497 Odd = true;
8498 continue;
8499 }
8500
8501 // Something does not match: abort.
8502 Odd = false;
8503 Even = false;
8504 break;
8505 }
8506 if (Even || Odd) {
8507 SDValue LHS =
8508 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
8509 DAG.getConstant(0, dl, MVT::i64));
8510 SDValue RHS =
8511 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
8512 DAG.getConstant(NumElts, dl, MVT::i64));
8513
8514 if (Even && !Odd)
8515 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
8516 RHS);
8517 if (Odd && !Even)
8518 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
8519 RHS);
8520 }
8521 }
8522
8523 // Use DUP for non-constant splats. For f32 constant splats, reduce to
8524 // i32 and try again.
8525 if (usesOnlyOneValue) {
8526 if (!isConstant) {
8527 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8528 Value.getValueType() != VT) {
8529 LLVM_DEBUG(
8530 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
8531 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
8532 }
8533
8534 // This is actually a DUPLANExx operation, which keeps everything vectory.
8535
8536 SDValue Lane = Value.getOperand(1);
8537 Value = Value.getOperand(0);
8538 if (Value.getValueSizeInBits() == 64) {
8539 LLVM_DEBUG(
8540 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
8541 "widening it\n");
8542 Value = WidenVector(Value, DAG);
8543 }
8544
8545 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
8546 return DAG.getNode(Opcode, dl, VT, Value, Lane);
8547 }
8548
8549 if (VT.getVectorElementType().isFloatingPoint()) {
8550 SmallVector<SDValue, 8> Ops;
8551 EVT EltTy = VT.getVectorElementType();
8552 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
8553 EltTy == MVT::f64) && "Unsupported floating-point vector type");
8554 LLVM_DEBUG(
8555 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
8556 "BITCASTS, and try again\n");
8557 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
8558 for (unsigned i = 0; i < NumElts; ++i)
8559 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
8560 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
8561 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
8562 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
8563 Val.dump(););
8564 Val = LowerBUILD_VECTOR(Val, DAG);
8565 if (Val.getNode())
8566 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8567 }
8568 }
8569
8570 // If there was only one constant value used and for more than one lane,
8571 // start by splatting that value, then replace the non-constant lanes. This
8572 // is better than the default, which will perform a separate initialization
8573 // for each lane.
8574 if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
8575 // Firstly, try to materialize the splat constant.
8576 SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
8577 Val = ConstantBuildVector(Vec, DAG);
8578 if (!Val) {
8579 // Otherwise, materialize the constant and splat it.
8580 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
8581 DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
8582 }
8583
8584 // Now insert the non-constant lanes.
8585 for (unsigned i = 0; i < NumElts; ++i) {
8586 SDValue V = Op.getOperand(i);
8587 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
8588 if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
8589 // Note that type legalization likely mucked about with the VT of the
8590 // source operand, so we may have to convert it here before inserting.
8591 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
8592 }
8593 return Val;
8594 }
8595
8596 // This will generate a load from the constant pool.
8597 if (isConstant) {
8598 LLVM_DEBUG(
8599 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
8600 "expansion\n");
8601 return SDValue();
8602 }
8603
8604 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
8605 if (NumElts >= 4) {
8606 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
8607 return shuffle;
8608 }
8609
8610 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
8611 // know the default expansion would otherwise fall back on something even
8612 // worse. For a vector with one or two non-undef values, that's
8613 // scalar_to_vector for the elements followed by a shuffle (provided the
8614 // shuffle is valid for the target) and materialization element by element
8615 // on the stack followed by a load for everything else.
8616 if (!isConstant && !usesOnlyOneValue) {
8617 LLVM_DEBUG(
8618 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
8619 "of INSERT_VECTOR_ELT\n");
8620
8621 SDValue Vec = DAG.getUNDEF(VT);
8622 SDValue Op0 = Op.getOperand(0);
8623 unsigned i = 0;
8624
8625 // Use SCALAR_TO_VECTOR for lane zero to
8626 // a) Avoid a RMW dependency on the full vector register, and
8627 // b) Allow the register coalescer to fold away the copy if the
8628 // value is already in an S or D register, and we're forced to emit an
8629 // INSERT_SUBREG that we can't fold anywhere.
8630 //
8631 // We also allow types like i8 and i16 which are illegal scalar but legal
8632 // vector element types. After type-legalization the inserted value is
8633 // extended (i32) and it is safe to cast them to the vector type by ignoring
8634 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
8635 if (!Op0.isUndef()) {
8636 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
8637 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
8638 ++i;
8639 }
8640 LLVM_DEBUG(if (i < NumElts) dbgs()
8641 << "Creating nodes for the other vector elements:\n";);
8642 for (; i < NumElts; ++i) {
8643 SDValue V = Op.getOperand(i);
8644 if (V.isUndef())
8645 continue;
8646 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
8647 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
8648 }
8649 return Vec;
8650 }
8651
8652 LLVM_DEBUG(
8653 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
8654 "better alternative\n");
8655 return SDValue();
8656}
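// Editorial example for the AllLanesExtractElt path above (not in the
// original file): a v4i16 build_vector of extractelts <0,2,4,6> from one
// v8i16 source matches the Even pattern, so it lowers to UZP1 of the
// source's two halves - a single "uzp1 v0.4h" instead of four lane moves.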
8657
8658SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8659 SelectionDAG &DAG) const {
8660 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
8661
8662 // Check for non-constant or out of range lane.
8663 EVT VT = Op.getOperand(0).getValueType();
8664 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
8665 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
8666 return SDValue();
8667
8668
8669 // Insertion/extraction are legal for V128 types.
8670 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
8671 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
8672 VT == MVT::v8f16 || VT == MVT::v8bf16)
8673 return Op;
8674
8675 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
8676 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
8677 VT != MVT::v4bf16)
8678 return SDValue();
8679
8680 // For V64 types, we perform insertion by expanding the value
8681 // to a V128 type and perform the insertion on that.
8682 SDLoc DL(Op);
8683 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
8684 EVT WideTy = WideVec.getValueType();
8685
8686 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
8687 Op.getOperand(1), Op.getOperand(2));
8688 // Re-narrow the resultant vector.
8689 return NarrowVector(Node, DAG);
8690}
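// Editorial example (not in the original file): an insert into v4i16 is
// widened to v8i16 (upper half undef), inserted there, and narrowed back,
// so "insertelement <4 x i16> %v, i16 %s, i32 2" still selects to a
// single "mov v0.h[2], w0".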
8691
8692SDValue
8693AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
8694 SelectionDAG &DAG) const {
8695 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
8696
8697 // Check for non-constant or out of range lane.
8698 EVT VT = Op.getOperand(0).getValueType();
8699 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
8700 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
8701 return SDValue();
8702
8703
8704 // Insertion/extraction are legal for V128 types.
8705 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
8706 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
8707 VT == MVT::v8f16 || VT == MVT::v8bf16)
8708 return Op;
8709
8710 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
8711 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
8712 VT != MVT::v4bf16)
8713 return SDValue();
8714
8715 // For V64 types, we perform extraction by expanding the value
8716 // to a V128 type and perform the extraction on that.
8717 SDLoc DL(Op);
8718 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
8719 EVT WideTy = WideVec.getValueType();
8720
8721 EVT ExtrTy = WideTy.getVectorElementType();
8722 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
8723 ExtrTy = MVT::i32;
8724
8725 // For extractions, we just return the result directly.
8726 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
8727 Op.getOperand(1));
8728}
8729
8730SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
8731 SelectionDAG &DAG) const {
8732 assert(Op.getValueType().isFixedLengthVector() &&
8733 "Only cases that extract a fixed length vector are supported!");
8734
8735 EVT InVT = Op.getOperand(0).getValueType();
8736 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
8737 unsigned Size = Op.getValueSizeInBits();
8738
8739 if (InVT.isScalableVector()) {
8740 // This will be matched by custom code during ISelDAGToDAG.
8741 if (Idx == 0 && isPackedVectorType(InVT, DAG))
8742 return Op;
8743
8744 return SDValue();
8745 }
8746
8747 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
8748 if (Idx == 0 && InVT.getSizeInBits() <= 128)
8749 return Op;
8750
8751 // If this is extracting the upper 64-bits of a 128-bit vector, we match
8752 // that directly.
8753 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64)
8754 return Op;
8755
8756 return SDValue();
8757}
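// Editorial example (not in the original file): extracting <2 x i32> at
// index 2 from a v4i32 source has Size == 64 and Idx * 32 == 64, so it is
// matched directly as taking the high 64-bit half of the 128-bit register.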
8758
8759SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
8760 SelectionDAG &DAG) const {
8761 assert(Op.getValueType().isScalableVector() &&
8762 "Only expect to lower inserts into scalable vectors!");
8763
8764 EVT InVT = Op.getOperand(1).getValueType();
8765 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
8766
8767 // We don't have any patterns for scalable vectors yet.
8768 if (InVT.isScalableVector() || !useSVEForFixedLengthVectorVT(InVT))
8769 return SDValue();
8770
8771 // This will be matched by custom code during ISelDAGToDAG.
8772 if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
8773 return Op;
8774
8775 return SDValue();
8776}
8777
8778bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8779 // Currently no fixed length shuffles that require SVE are legal.
8780 if (useSVEForFixedLengthVectorVT(VT))
8781 return false;
8782
8783 if (VT.getVectorNumElements() == 4 &&
8784 (VT.is128BitVector() || VT.is64BitVector())) {
8785 unsigned PFIndexes[4];
8786 for (unsigned i = 0; i != 4; ++i) {
8787 if (M[i] < 0)
8788 PFIndexes[i] = 8;
8789 else
8790 PFIndexes[i] = M[i];
8791 }
8792
8793 // Compute the index in the perfect shuffle table.
8794 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
8795 PFIndexes[2] * 9 + PFIndexes[3];
8796 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8797 unsigned Cost = (PFEntry >> 30);
8798
8799 if (Cost <= 4)
8800 return true;
8801 }
8802
8803 bool DummyBool;
8804 int DummyInt;
8805 unsigned DummyUnsigned;
8806
8807 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
8808 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
8809 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
8810 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
8811 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
8812 isZIPMask(M, VT, DummyUnsigned) ||
8813 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
8814 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
8815 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
8816 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
8817 isConcatMask(M, VT, VT.getSizeInBits() == 128));
8818}
8819
8820/// getVShiftImm - Check if this is a valid build_vector for the immediate
8821/// operand of a vector shift operation, where all the elements of the
8822/// build_vector must have the same constant integer value.
8823static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
8824 // Ignore bit_converts.
8825 while (Op.getOpcode() == ISD::BITCAST)
8826 Op = Op.getOperand(0);
8827 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
8828 APInt SplatBits, SplatUndef;
8829 unsigned SplatBitSize;
8830 bool HasAnyUndefs;
8831 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
8832 HasAnyUndefs, ElementBits) ||
8833 SplatBitSize > ElementBits)
8834 return false;
8835 Cnt = SplatBits.getSExtValue();
8836 return true;
8837}
8838
8839/// isVShiftLImm - Check if this is a valid build_vector for the immediate
8840/// operand of a vector shift left operation. That value must be in the range:
8841/// 0 <= Value < ElementBits for a left shift; or
8842/// 0 <= Value <= ElementBits for a long left shift.
8843static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
8844 assert(VT.isVector() && "vector shift count is not a vector type");
8845 int64_t ElementBits = VT.getScalarSizeInBits();
8846 if (!getVShiftImm(Op, ElementBits, Cnt))
8847 return false;
8848 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
8849}
8850
8851/// isVShiftRImm - Check if this is a valid build_vector for the immediate
8852/// operand of a vector shift right operation. The value must be in the range:
8853/// 1 <= Value <= ElementBits for a right shift, or 1 <= Value <= ElementBits/2 for a narrow right shift.
8854static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
8855 assert(VT.isVector() && "vector shift count is not a vector type");
8856 int64_t ElementBits = VT.getScalarSizeInBits();
8857 if (!getVShiftImm(Op, ElementBits, Cnt))
8858 return false;
8859 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
8860}
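// Editorial worked examples (not in the original file): for v4i32,
// "shl" by a splat of 5 passes isVShiftLImm (0 <= 5 < 32) and selects to
// "shl v0.4s, v1.4s, #5", while a right shift by 32 still passes
// isVShiftRImm (1 <= 32 <= 32), matching the sshr/ushr immediate range.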
8861
8862// Attempt to form urhadd(OpA, OpB) from
8863// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)).
8864// The original form of this expression is
8865// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function
8866// is called the srl will have been lowered to AArch64ISD::VLSHR and the
8867// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)).
8868// This pass can also recognize a variant of this pattern that uses sign
8869// extension instead of zero extension and form a srhadd(OpA, OpB) from it.
8870SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
8871 SelectionDAG &DAG) const {
8872 EVT VT = Op.getValueType();
8873
8874 if (VT.getScalarType() == MVT::i1) {
8875 // Lower i1 truncate to `(x & 1) != 0`.
8876 SDLoc dl(Op);
8877 EVT OpVT = Op.getOperand(0).getValueType();
8878 SDValue Zero = DAG.getConstant(0, dl, OpVT);
8879 SDValue One = DAG.getConstant(1, dl, OpVT);
8880 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
8881 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
8882 }
8883
8884 if (!VT.isVector() || VT.isScalableVector())
8885 return Op;
8886
8887 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
8888 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
8889
8890 // Since we are looking for a right shift by a constant value of 1 and we are
8891 // operating on types at least 16 bits in length (sign/zero extended OpA and
8892 // OpB, which are at least 8 bits), it follows that the truncate will always
8893 // discard the shifted-in bit and therefore the right shift will be logical
8894 // regardless of the signedness of OpA and OpB.
8895 SDValue Shift = Op.getOperand(0);
8896 if (Shift.getOpcode() != AArch64ISD::VLSHR)
8897 return Op;
8898
8899 // Is the right shift using an immediate value of 1?
8900 uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
8901 if (ShiftAmount != 1)
8902 return Op;
8903
8904 SDValue Sub = Shift->getOperand(0);
8905 if (Sub.getOpcode() != ISD::SUB)
8906 return Op;
8907
8908 SDValue Xor = Sub.getOperand(1);
8909 if (Xor.getOpcode() != ISD::XOR)
8910 return Op;
8911
8912 SDValue ExtendOpA = Xor.getOperand(0);
8913 SDValue ExtendOpB = Sub.getOperand(0);
8914 unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
8915 unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
8916 if (!(ExtendOpAOpc == ExtendOpBOpc &&
8917 (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
8918 return Op;
8919
8920 // Is the result of the right shift being truncated to the same value type as
8921 // the original operands, OpA and OpB?
8922 SDValue OpA = ExtendOpA.getOperand(0);
8923 SDValue OpB = ExtendOpB.getOperand(0);
8924 EVT OpAVT = OpA.getValueType();
8925 assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
8926 if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
8927 return Op;
8928
8929 // Is the XOR using a constant amount of all ones in the right hand side?
8930 uint64_t C;
8931 if (!isAllConstantBuildVector(Xor.getOperand(1), C))
8932 return Op;
8933
8934 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
8935 APInt CAsAPInt(ElemSizeInBits, C);
8936 if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
8937 return Op;
8938
8939 SDLoc DL(Op);
8940 unsigned RHADDOpc = ExtendOpAOpc == ISD::ZERO_EXTEND ? AArch64ISD::URHADD
8941 : AArch64ISD::SRHADD;
8942 SDValue ResultURHADD = DAG.getNode(RHADDOpc, DL, VT, OpA, OpB);
8943
8944 return ResultURHADD;
8945}
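// Editorial worked example (not in the original file): for v8i8 A and B
// zero-extended to v8i16, DAGCombine rewrites (A + B + 1) as
// B - (A ^ 0xFFFF), because A ^ 0xFFFF == ~A == -A - 1 in 16 bits. The
// matcher above undoes that and emits (AArch64ISD::URHADD A, B), i.e.
// "urhadd v0.8b, v1.8b, v2.8b".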
8946
8947SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
8948 SelectionDAG &DAG) const {
8949 EVT VT = Op.getValueType();
8950 SDLoc DL(Op);
8951 int64_t Cnt;
8952
8953 if (!Op.getOperand(1).getValueType().isVector())
8954 return Op;
8955 unsigned EltSize = VT.getScalarSizeInBits();
8956
8957 switch (Op.getOpcode()) {
8958 default:
8959 llvm_unreachable("unexpected shift opcode");
8960
8961 case ISD::SHL:
8962 if (VT.isScalableVector())
8963 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1);
8964
8965 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
8966 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
8967 DAG.getConstant(Cnt, DL, MVT::i32));
8968 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8969 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
8970 MVT::i32),
8971 Op.getOperand(0), Op.getOperand(1));
8972 case ISD::SRA:
8973 case ISD::SRL:
8974 if (VT.isScalableVector()) {
8975 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_MERGE_OP1
8976 : AArch64ISD::SRL_MERGE_OP1;
8977 return LowerToPredicatedOp(Op, DAG, Opc);
8978 }
8979
8980 // Right shift immediate
8981 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
8982 unsigned Opc =
8983 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
8984 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
8985 DAG.getConstant(Cnt, DL, MVT::i32));
8986 }
8987
8988 // Right shift register. Note that there is no shift-right-register
8989 // instruction, but the shift-left-register instruction takes a signed
8990 // value, where negative numbers specify a right shift.
8991 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
8992 : Intrinsic::aarch64_neon_ushl;
8993 // negate the shift amount
8994 SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
8995 SDValue NegShiftLeft =
8996 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8997 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
8998 NegShift);
8999 return NegShiftLeft;
9000 }
9001
9002 return SDValue();
9003}
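// Editorial example (not in the original file): a variable v4i32 "lshr"
// has no direct NEON encoding, so the register case above produces
//   neg  v1.4s, v1.4s
//   ushl v0.4s, v0.4s, v1.4s
// exploiting ushl's convention that negative amounts shift right.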
9004
9005static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
9006 AArch64CC::CondCode CC, bool NoNans, EVT VT,
9007 const SDLoc &dl, SelectionDAG &DAG) {
9008 EVT SrcVT = LHS.getValueType();
9009 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
9010 "function only supposed to emit natural comparisons");
9011
9012 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
9013 APInt CnstBits(VT.getSizeInBits(), 0);
9014 APInt UndefBits(VT.getSizeInBits(), 0);
9015 bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
9016 bool IsZero = IsCnst && (CnstBits == 0);
9017
9018 if (SrcVT.getVectorElementType().isFloatingPoint()) {
9019 switch (CC) {
9020 default:
9021 return SDValue();
9022 case AArch64CC::NE: {
9023 SDValue Fcmeq;
9024 if (IsZero)
9025 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
9026 else
9027 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
9028 return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
9029 }
9030 case AArch64CC::EQ:
9031 if (IsZero)
9032 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
9033 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
9034 case AArch64CC::GE:
9035 if (IsZero)
9036 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
9037 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
9038 case AArch64CC::GT:
9039 if (IsZero)
9040 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
9041 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
9042 case AArch64CC::LS:
9043 if (IsZero)
9044 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
9045 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
9046 case AArch64CC::LT:
9047 if (!NoNans)
9048 return SDValue();
9049 // If we ignore NaNs then we can use the MI implementation.
9050 LLVM_FALLTHROUGH;
9051 case AArch64CC::MI:
9052 if (IsZero)
9053 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
9054 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
9055 }
9056 }
9057
9058 switch (CC) {
9059 default:
9060 return SDValue();
9061 case AArch64CC::NE: {
9062 SDValue Cmeq;
9063 if (IsZero)
9064 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
9065 else
9066 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
9067 return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
9068 }
9069 case AArch64CC::EQ:
9070 if (IsZero)
9071 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
9072 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
9073 case AArch64CC::GE:
9074 if (IsZero)
9075 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
9076 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
9077 case AArch64CC::GT:
9078 if (IsZero)
9079 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
9080 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
9081 case AArch64CC::LE:
9082 if (IsZero)
9083 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
9084 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
9085 case AArch64CC::LS:
9086 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
9087 case AArch64CC::LO:
9088 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
9089 case AArch64CC::LT:
9090 if (IsZero)
9091 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
9092 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
9093 case AArch64CC::HI:
9094 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
9095 case AArch64CC::HS:
9096 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
9097 }
9098}
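// Editorial example (not in the original file): an unsigned-less-than
// setcc maps to AArch64CC::LO, which the integer switch above emits as
// (CMHI RHS, LHS) - there is no CMLO, so the operands are swapped - and
// comparisons against a zero splat use the single-operand forms (CMEQz,
// CMGTz, ...) instead of the two-register ones.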
9099
9100SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
9101 SelectionDAG &DAG) const {
9102 if (Op.getValueType().isScalableVector()) {
9103 if (Op.getOperand(0).getValueType().isFloatingPoint())
9104 return Op;
9105 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
9106 }
9107
9108 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
9109 SDValue LHS = Op.getOperand(0);
9110 SDValue RHS = Op.getOperand(1);
9111 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
9112 SDLoc dl(Op);
9113
9114 if (LHS.getValueType().getVectorElementType().isInteger()) {
9115 assert(LHS.getValueType() == RHS.getValueType());
9116 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9117 SDValue Cmp =
9118 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
9119 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
9120 }
9121
9122 const bool FullFP16 =
9123 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
9124
9125 // Make v4f16 (only) fcmp operations utilise vector instructions
9126 // v8f16 support will be a little more complicated
9127 if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
9128 if (LHS.getValueType().getVectorNumElements() == 4) {
9129 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
9130 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
9131 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
9132 DAG.ReplaceAllUsesWith(Op, NewSetcc);
9133 CmpVT = MVT::v4i32;
9134 } else
9135 return SDValue();
9136 }
9137
9138 assert(LHS.getValueType().getVectorElementType() != MVT::f16 ||
9139 LHS.getValueType().getVectorElementType() != MVT::f128);
9140
9141 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9142 // clean. Some of them require two branches to implement.
9143 AArch64CC::CondCode CC1, CC2;
9144 bool ShouldInvert;
9145 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
9146
9147 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
9148 SDValue Cmp =
9149 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
9150 if (!Cmp.getNode())
9151 return SDValue();
9152
9153 if (CC2 != AArch64CC::AL) {
9154 SDValue Cmp2 =
9155 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
9156 if (!Cmp2.getNode())
9157 return SDValue();
9158
9159 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
9160 }
9161
9162 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
9163
9164 if (ShouldInvert)
9165 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
9166
9167 return Cmp;
9168}
9169
9170static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
9171 SelectionDAG &DAG) {
9172 SDValue VecOp = ScalarOp.getOperand(0);
9173 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
9174 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
9175 DAG.getConstant(0, DL, MVT::i64));
9176}
9177
9178SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
9179 SelectionDAG &DAG) const {
9180 SDLoc dl(Op);
9181 switch (Op.getOpcode()) {
9182 case ISD::VECREDUCE_ADD:
9183 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
9184 case ISD::VECREDUCE_SMAX:
9185 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
9186 case ISD::VECREDUCE_SMIN:
9187 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
9188 case ISD::VECREDUCE_UMAX:
9189 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
9190 case ISD::VECREDUCE_UMIN:
9191 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
9192 case ISD::VECREDUCE_FMAX: {
9193 assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
9194 return DAG.getNode(
9195 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
9196 DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
9197 Op.getOperand(0));
9198 }
9199 case ISD::VECREDUCE_FMIN: {
9200 assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
9201 return DAG.getNode(
9202 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
9203 DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
9204 Op.getOperand(0));
9205 }
9206 default:
9207 llvm_unreachable("Unhandled reduction");
9208 }
9209}
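// Editorial example (not in the original file): vecreduce_add on v4i32
// becomes (extractelt (AArch64ISD::UADDV %v), 0), selecting to
// "addv s0, v0.4s". The FMAX/FMIN cases insist on the no-NaNs flag since
// fmaxnmv/fminnmv follow IEEE maxNum/minNum NaN semantics, which only
// coincide with the generic reduction when NaNs are excluded.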
9210
9211SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
9212 SelectionDAG &DAG) const {
9213 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
9214 if (!Subtarget.hasLSE())
9215 return SDValue();
9216
9217 // LSE has an atomic load-add instruction, but not a load-sub.
9218 SDLoc dl(Op);
9219 MVT VT = Op.getSimpleValueType();
9220 SDValue RHS = Op.getOperand(2);
9221 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
9222 RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
9223 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
9224 Op.getOperand(0), Op.getOperand(1), RHS,
9225 AN->getMemOperand());
9226}
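// Editorial example (not in the original file): with LSE,
// "atomicrmw sub i32* %p, i32 %v" becomes an atomic load-add of (0 - %v),
// so instead of an LL/SC loop it selects to a "neg" followed by a single
// ldadd-family instruction such as "ldaddal w1, w0, [x0]".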
9227
9228SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
9229 SelectionDAG &DAG) const {
9230 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
9231 if (!Subtarget.hasLSE())
9232 return SDValue();
9233
9234 // LSE has an atomic load-clear instruction, but not a load-and.
9235 SDLoc dl(Op);
9236 MVT VT = Op.getSimpleValueType();
9237 SDValue RHS = Op.getOperand(2);
9238 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
9239 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
9240 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
9241 Op.getOperand(0), Op.getOperand(1), RHS,
9242 AN->getMemOperand());
9243}
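// Editorial example (not in the original file): "atomicrmw and" is
// re-expressed through load-clear: LDCLR computes mem & ~operand, so
// passing (xor %v, -1) yields mem & %v - an "mvn" plus one ldclr-family
// instruction instead of an LL/SC loop.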
9244
9245SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
9246 SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
9247 SDLoc dl(Op);
9248 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9249 SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
9250
9251 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9252 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
9253 if (Subtarget->hasCustomCallingConv())
9254 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
9255
9256 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
9257 DAG.getConstant(4, dl, MVT::i64));
9258 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
9259 Chain =
9260 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
9261 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
9262 DAG.getRegisterMask(Mask), Chain.getValue(1));
9263 // To match the actual intent better, we should read the output from X15 here
9264 // again (instead of potentially spilling it to the stack), but rereading Size
9265 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
9266 // here.
9267
9268 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
9269 DAG.getConstant(4, dl, MVT::i64));
9270 return Chain;
9271}
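// Editorial note (not in the original file): Windows' __chkstk takes the
// allocation size in X15 in 16-byte units, hence the "srl #4" before the
// call and the "shl #4" after it; a 4096-byte allocation therefore calls
// __chkstk with X15 == 256.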
9272
9273SDValue
9274AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
9275 SelectionDAG &DAG) const {
9276 assert(Subtarget->isTargetWindows() &&
9277 "Only Windows alloca probing supported");
9278 SDLoc dl(Op);
9279 // Get the inputs.
9280 SDNode *Node = Op.getNode();
9281 SDValue Chain = Op.getOperand(0);
9282 SDValue Size = Op.getOperand(1);
9283 MaybeAlign Align =
9284 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
9285 EVT VT = Node->getValueType(0);
9286
9287 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9288 "no-stack-arg-probe")) {
9289 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
9290 Chain = SP.getValue(1);
9291 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
9292 if (Align)
9293 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
9294 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
9295 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
9296 SDValue Ops[2] = {SP, Chain};
9297 return DAG.getMergeValues(Ops, dl);
9298 }
9299
9300 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
9301
9302 Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
9303
9304 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
9305 Chain = SP.getValue(1);
9306 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
9307 if (Align)
9308 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
9309 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
9310 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
9311
9312 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
9313 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
9314
9315 SDValue Ops[2] = {SP, Chain};
9316 return DAG.getMergeValues(Ops, dl);
9317}
9318
9319SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
9320 SelectionDAG &DAG) const {
9321 EVT VT = Op.getValueType();
9322 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
9323
9324 SDLoc DL(Op);
9325 APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
9326 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)),
9327 DL, VT);
9328}
9329
9330/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
9331template <unsigned NumVecs>
9332static bool setInfoSVEStN(TargetLowering::IntrinsicInfo &Info,
9333 const CallInst &CI) {
9334 Info.opc = ISD::INTRINSIC_VOID;
9335 // Retrieve EC from first vector argument.
9336 const EVT VT = EVT::getEVT(CI.getArgOperand(0)->getType());
9337 ElementCount EC = VT.getVectorElementCount();
9338#ifndef NDEBUG
9339 // Check the assumption that all input vectors are the same type.
9340 for (unsigned I = 0; I < NumVecs; ++I)
9341 assert(VT == EVT::getEVT(CI.getArgOperand(I)->getType()) &&
9342 "Invalid type.");
9343#endif
9344 // memVT is `NumVecs * VT`.
9345 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
9346 EC * NumVecs);
9347 Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
9348 Info.offset = 0;
9349 Info.align.reset();
9350 Info.flags = MachineMemOperand::MOStore;
9351 return true;
9352}
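// Editorial worked example (not in the original file): an
// llvm.aarch64.sve.st3 call with three nxv4i32 data operands gives
// NumVecs == 3 and EC == vscale x 4, so memVT above is nxv12i32 and
// ptrVal is the trailing pointer argument of the intrinsic.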
9353
9354/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
9355/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
9356/// specified in the intrinsic calls.
9357bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
9358 const CallInst &I,
9359 MachineFunction &MF,
9360 unsigned Intrinsic) const {
9361 auto &DL = I.getModule()->getDataLayout();
9362 switch (Intrinsic) {
9363 case Intrinsic::aarch64_sve_st2:
9364 return setInfoSVEStN<2>(Info, I);
9365 case Intrinsic::aarch64_sve_st3:
9366 return setInfoSVEStN<3>(Info, I);
9367 case Intrinsic::aarch64_sve_st4:
9368 return setInfoSVEStN<4>(Info, I);
9369 case Intrinsic::aarch64_neon_ld2:
9370 case Intrinsic::aarch64_neon_ld3:
9371 case Intrinsic::aarch64_neon_ld4:
9372 case Intrinsic::aarch64_neon_ld1x2:
9373 case Intrinsic::aarch64_neon_ld1x3:
9374 case Intrinsic::aarch64_neon_ld1x4:
9375 case Intrinsic::aarch64_neon_ld2lane:
9376 case Intrinsic::aarch64_neon_ld3lane:
9377 case Intrinsic::aarch64_neon_ld4lane:
9378 case Intrinsic::aarch64_neon_ld2r:
9379 case Intrinsic::aarch64_neon_ld3r:
9380 case Intrinsic::aarch64_neon_ld4r: {
9381 Info.opc = ISD::INTRINSIC_W_CHAIN;
9382 // Conservatively set memVT to the entire set of vectors loaded.
9383 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
9384 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
9385 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
9386 Info.offset = 0;
9387 Info.align.reset();
9388 // volatile loads with NEON intrinsics not supported
9389 Info.flags = MachineMemOperand::MOLoad;
9390 return true;
9391 }
9392 case Intrinsic::aarch64_neon_st2:
9393 case Intrinsic::aarch64_neon_st3:
9394 case Intrinsic::aarch64_neon_st4:
9395 case Intrinsic::aarch64_neon_st1x2:
9396 case Intrinsic::aarch64_neon_st1x3:
9397 case Intrinsic::aarch64_neon_st1x4:
9398 case Intrinsic::aarch64_neon_st2lane:
9399 case Intrinsic::aarch64_neon_st3lane:
9400 case Intrinsic::aarch64_neon_st4lane: {
9401 Info.opc = ISD::INTRINSIC_VOID;
9402 // Conservatively set memVT to the entire set of vectors stored.
9403 unsigned NumElts = 0;
9404 for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
9405 Type *ArgTy = I.getArgOperand(ArgI)->getType();
9406 if (!ArgTy->isVectorTy())
9407 break;
9408 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
9409 }
9410 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
9411 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
9412 Info.offset = 0;
9413 Info.align.reset();
9414 // volatile stores with NEON intrinsics not supported
9415 Info.flags = MachineMemOperand::MOStore;
9416 return true;
9417 }
9418 case Intrinsic::aarch64_ldaxr:
9419 case Intrinsic::aarch64_ldxr: {
9420 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
9421 Info.opc = ISD::INTRINSIC_W_CHAIN;
9422 Info.memVT = MVT::getVT(PtrTy->getElementType());
9423 Info.ptrVal = I.getArgOperand(0);
9424 Info.offset = 0;
9425 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
9426 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
9427 return true;
9428 }
9429 case Intrinsic::aarch64_stlxr:
9430 case Intrinsic::aarch64_stxr: {
9431 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
9432 Info.opc = ISD::INTRINSIC_W_CHAIN;
9433 Info.memVT = MVT::getVT(PtrTy->getElementType());
9434 Info.ptrVal = I.getArgOperand(1);
9435 Info.offset = 0;
9436 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
9437 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
9438 return true;
9439 }
9440 case Intrinsic::aarch64_ldaxp:
9441 case Intrinsic::aarch64_ldxp:
9442 Info.opc = ISD::INTRINSIC_W_CHAIN;
9443 Info.memVT = MVT::i128;
9444 Info.ptrVal = I.getArgOperand(0);
9445 Info.offset = 0;
9446 Info.align = Align(16);
9447 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
9448 return true;
9449 case Intrinsic::aarch64_stlxp:
9450 case Intrinsic::aarch64_stxp:
9451 Info.opc = ISD::INTRINSIC_W_CHAIN;
9452 Info.memVT = MVT::i128;
9453 Info.ptrVal = I.getArgOperand(2);
9454 Info.offset = 0;
9455 Info.align = Align(16);
9456 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
9457 return true;
9458 case Intrinsic::aarch64_sve_ldnt1: {
9459 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
9460 Info.opc = ISD::INTRINSIC_W_CHAIN;
9461 Info.memVT = MVT::getVT(I.getType());
9462 Info.ptrVal = I.getArgOperand(1);
9463 Info.offset = 0;
9464 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
9465 Info.flags = MachineMemOperand::MOLoad;
9466 if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
9467 Info.flags |= MachineMemOperand::MONonTemporal;
9468 return true;
9469 }
9470 case Intrinsic::aarch64_sve_stnt1: {
9471 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
9472 Info.opc = ISD::INTRINSIC_W_CHAIN;
9473 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
9474 Info.ptrVal = I.getArgOperand(2);
9475 Info.offset = 0;
9476 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
9477 Info.flags = MachineMemOperand::MOStore;
9478 if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
9479 Info.flags |= MachineMemOperand::MONonTemporal;
9480 return true;
9481 }
9482 default:
9483 break;
9484 }
9485
9486 return false;
9487}
9488
9489 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
9490 ISD::LoadExtType ExtTy,
9491 EVT NewVT) const {
9492 // TODO: This may be worth removing. Check regression tests for diffs.
9493 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
9494 return false;
9495
9496 // If we're reducing the load width in order to avoid having to use an extra
9497 // instruction to do extension then it's probably a good idea.
9498 if (ExtTy != ISD::NON_EXTLOAD)
9499 return true;
9500 // Don't reduce load width if it would prevent us from combining a shift into
9501 // the offset.
9502 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
9503 assert(Mem);
9504 const SDValue &Base = Mem->getBasePtr();
9505 if (Base.getOpcode() == ISD::ADD &&
9506 Base.getOperand(1).getOpcode() == ISD::SHL &&
9507 Base.getOperand(1).hasOneUse() &&
9508 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
9509 // The shift can be combined if it matches the size of the value being
9510 // loaded (and so reducing the width would make it not match).
9511 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
9512 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
9513 if (ShiftAmount == Log2_32(LoadBytes))
9514 return false;
9515 }
9516 // We have no reason to disallow reducing the load width, so allow it.
9517 return true;
9518}
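// For example (an illustrative case, not from the original source): a plain
// i64 load from (add x, (shl y, 3)) is kept at full width, because the shift
// by 3 matches the 8-byte access size and folds into the scaled addressing
// mode, roughly "ldr x0, [x1, x2, lsl #3]". Narrowing the load to i32
// (scale 4) would strand the shift as a separate instruction.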
9519
9520 // Truncations from 64-bit GPR to 32-bit GPR are free.
9521 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
9522 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
9523 return false;
9524 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
9525 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
9526 return NumBits1 > NumBits2;
9527}
9528 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
9529 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
9530 return false;
9531 unsigned NumBits1 = VT1.getSizeInBits();
9532 unsigned NumBits2 = VT2.getSizeInBits();
9533 return NumBits1 > NumBits2;
9534}
9535
9536 /// Check if it is profitable to hoist an instruction in then/else to if.
9537 /// Not profitable if I and its user can form an FMA instruction
9538/// because we prefer FMSUB/FMADD.
9539 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
9540 if (I->getOpcode() != Instruction::FMul)
9541 return true;
9542
9543 if (!I->hasOneUse())
9544 return true;
9545
9546 Instruction *User = I->user_back();
9547
9548 if (User &&
9549 !(User->getOpcode() == Instruction::FSub ||
9550 User->getOpcode() == Instruction::FAdd))
9551 return true;
9552
9553 const TargetOptions &Options = getTargetMachine().Options;
9554 const Function *F = I->getFunction();
9555 const DataLayout &DL = F->getParent()->getDataLayout();
9556 Type *Ty = User->getOperand(0)->getType();
9557
9558 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
9559 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
9560 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
9561 Options.UnsafeFPMath));
9562}
9563
9564// All 32-bit GPR operations implicitly zero the high-half of the corresponding
9565// 64-bit GPR.
9566 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
9567 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
9568 return false;
9569 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
9570 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
9571 return NumBits1 == 32 && NumBits2 == 64;
9572}
9573 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
9574 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
9575 return false;
9576 unsigned NumBits1 = VT1.getSizeInBits();
9577 unsigned NumBits2 = VT2.getSizeInBits();
9578 return NumBits1 == 32 && NumBits2 == 64;
9579}
9580
9581 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
9582 EVT VT1 = Val.getValueType();
9583 if (isZExtFree(VT1, VT2)) {
9584 return true;
9585 }
9586
9587 if (Val.getOpcode() != ISD::LOAD)
9588 return false;
9589
9590 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
9591 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
9592 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
9593 VT1.getSizeInBits() <= 32);
9594}
9595
9596bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
9597 if (isa<FPExtInst>(Ext))
9598 return false;
9599
9600 // Vector types are not free.
9601 if (Ext->getType()->isVectorTy())
9602 return false;
9603
9604 for (const Use &U : Ext->uses()) {
9605 // The extension is free if we can fold it with a left shift in an
9606 // addressing mode or an arithmetic operation: add, sub, and cmp.
9607
9608 // Is there a shift?
9609 const Instruction *Instr = cast<Instruction>(U.getUser());
9610
9611 // Is this a constant shift?
9612 switch (Instr->getOpcode()) {
9613 case Instruction::Shl:
9614 if (!isa<ConstantInt>(Instr->getOperand(1)))
9615 return false;
9616 break;
9617 case Instruction::GetElementPtr: {
9618 gep_type_iterator GTI = gep_type_begin(Instr);
9619 auto &DL = Ext->getModule()->getDataLayout();
9620 std::advance(GTI, U.getOperandNo()-1);
9621 Type *IdxTy = GTI.getIndexedType();
9622 // This extension will end up with a shift because of the scaling factor.
9623 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
9624 // Get the shift amount based on the scaling factor:
9625 // log2(sizeof(IdxTy)) - log2(8).
9626 uint64_t ShiftAmt =
9627 countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
9628 // Is the constant foldable in the shift of the addressing mode?
9629 // I.e., shift amount is between 1 and 4 inclusive.
9630 if (ShiftAmt == 0 || ShiftAmt > 4)
9631 return false;
9632 break;
9633 }
9634 case Instruction::Trunc:
9635 // Check if this is a noop.
9636 // trunc(sext ty1 to ty2) to ty1.
9637 if (Instr->getType() == Ext->getOperand(0)->getType())
9638 continue;
9639 LLVM_FALLTHROUGH;
9640 default:
9641 return false;
9642 }
9643
9644 // At this point we can use the bfm family, so this extension is free
9645 // for that use.
9646 }
9647 return true;
9648}
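// For example (an illustrative case, not from the original source): in
// (add x, (shl (zext i32 y to i64), 2)) both the zero-extend and the shift
// fold into the extended-register operand of the add, roughly
//   add x0, x1, w2, uxtw #2
// so the extension is free for that use.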
9649
9650/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
9651/// or upper half of the vector elements.
9652static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
9653 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
9654 auto *FullTy = FullV->getType();
9655 auto *HalfTy = HalfV->getType();
9656 return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
9657 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
9658 };
9659
9660 auto extractHalf = [](Value *FullV, Value *HalfV) {
9661 auto *FullVT = cast<FixedVectorType>(FullV->getType());
9662 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
9663 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
9664 };
9665
9666 ArrayRef<int> M1, M2;
9667 Value *S1Op1, *S2Op1;
9668 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
9669 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
9670 return false;
9671
9672 // Check that the operands are half as wide as the result and we extract
9673 // half of the elements of the input vectors.
9674 if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
9675 !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
9676 return false;
9677
9678 // Check the mask extracts either the lower or upper half of vector
9679 // elements.
9680 int M1Start = -1;
9681 int M2Start = -1;
9682 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
9683 if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
9684 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
9685 M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
9686 return false;
9687
9688 return true;
9689}
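// For example, with two <8 x i16> sources, the mask pair <0, 1, 2, 3> /
// <0, 1, 2, 3> extracts both lower halves and <4, 5, 6, 7> / <4, 5, 6, 7>
// extracts both upper halves; the upper-half form is what the "2" variants
// of the widening instructions (e.g. umull2) consume directly.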
9690
9691/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
9692/// of the vector elements.
9693 static bool areExtractExts(Value *Ext1, Value *Ext2) {
9694 auto areExtDoubled = [](Instruction *Ext) {
9695 return Ext->getType()->getScalarSizeInBits() ==
9696 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
9697 };
9698
9699 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
9700 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
9701 !areExtDoubled(cast<Instruction>(Ext1)) ||
9702 !areExtDoubled(cast<Instruction>(Ext2)))
9703 return false;
9704
9705 return true;
9706}
9707
9708/// Check if Op could be used with vmull_high_p64 intrinsic.
9709 static bool isOperandOfVmullHighP64(Value *Op) {
9710 Value *VectorOperand = nullptr;
9711 ConstantInt *ElementIndex = nullptr;
9712 return match(Op, m_ExtractElt(m_Value(VectorOperand),
9713 m_ConstantInt(ElementIndex))) &&
9714 ElementIndex->getValue() == 1 &&
9715 isa<FixedVectorType>(VectorOperand->getType()) &&
9716 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
9717}
9718
9719/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
9720static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
9721 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
9722}
9723
9724/// Check if sinking \p I's operands to I's basic block is profitable, because
9725/// the operands can be folded into a target instruction, e.g.
9726/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
9727 bool AArch64TargetLowering::shouldSinkOperands(
9728 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
9729 if (!I->getType()->isVectorTy())
9730 return false;
9731
9732 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
9733 switch (II->getIntrinsicID()) {
9734 case Intrinsic::aarch64_neon_umull:
9735 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
9736 return false;
9737 Ops.push_back(&II->getOperandUse(0));
9738 Ops.push_back(&II->getOperandUse(1));
9739 return true;
9740
9741 case Intrinsic::aarch64_neon_pmull64:
9742 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
9743 II->getArgOperand(1)))
9744 return false;
9745 Ops.push_back(&II->getArgOperandUse(0));
9746 Ops.push_back(&II->getArgOperandUse(1));
9747 return true;
9748
9749 default:
9750 return false;
9751 }
9752 }
9753
9754 switch (I->getOpcode()) {
9755 case Instruction::Sub:
9756 case Instruction::Add: {
9757 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
9758 return false;
9759
9760 // If the exts' operands extract either the lower or upper elements, we
9761 // can sink them too.
9762 auto Ext1 = cast<Instruction>(I->getOperand(0));
9763 auto Ext2 = cast<Instruction>(I->getOperand(1));
9764 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
9765 Ops.push_back(&Ext1->getOperandUse(0));
9766 Ops.push_back(&Ext2->getOperandUse(0));
9767 }
9768
9769 Ops.push_back(&I->getOperandUse(0));
9770 Ops.push_back(&I->getOperandUse(1));
9771
9772 return true;
9773 }
9774 default:
9775 return false;
9776 }
9777 return false;
9778}
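// For example, sinking the upper-half shufflevector operands of an
// aarch64_neon_umull into its block lets instruction selection fold them
// away and emit a single umull2 on the original 128-bit vectors instead of
// explicit extracts followed by a umull.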
9779
9780 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
9781 Align &RequiredAligment) const {
9782 if (!LoadedType.isSimple() ||
9783 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
9784 return false;
9785 // Cyclone supports unaligned accesses.
9786 RequiredAligment = Align(1);
9787 unsigned NumBits = LoadedType.getSizeInBits();
9788 return NumBits == 32 || NumBits == 64;
9789}
9790
9791/// A helper function for determining the number of interleaved accesses we
9792/// will generate when lowering accesses of the given type.
9793unsigned
9794 AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
9795 const DataLayout &DL) const {
9796 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
9797}
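// For example, a 512-bit <16 x i32> needs (512 + 127) / 128 = 4 interleaved
// accesses, while a 64-bit <8 x i8> needs just one.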
9798
9799 MachineMemOperand::Flags
9800 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
9801 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
9802 I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
9803 return MOStridedAccess;
9804 return MachineMemOperand::MONone;
9805}
9806
9807 bool AArch64TargetLowering::isLegalInterleavedAccessType(
9808 VectorType *VecTy, const DataLayout &DL) const {
9809
9810 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
9811 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
9812
9813 // Ensure the number of vector elements is greater than 1.
9814 if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
9815 return false;
9816
9817 // Ensure the element type is legal.
9818 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
9819 return false;
9820
9821 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
9822 // 128 will be split into multiple interleaved accesses.
9823 return VecSize == 64 || VecSize % 128 == 0;
9824}
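// For example, <8 x i8> (64 bits) and <4 x i32> (128 bits) are legal, a
// <16 x i32> (512 bits) is legal but split into four accesses, and a
// <2 x i16> (32 bits) is rejected.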
9825
9826/// Lower an interleaved load into a ldN intrinsic.
9827///
9828/// E.g. Lower an interleaved load (Factor = 2):
9829/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
9830/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
9831/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
9832///
9833/// Into:
9834/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
9835/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
9836/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
9837 bool AArch64TargetLowering::lowerInterleavedLoad(
9838 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
9839 ArrayRef<unsigned> Indices, unsigned Factor) const {
9840 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
9841 "Invalid interleave factor");
9842 assert(!Shuffles.empty() && "Empty shufflevector input");
9843 assert(Shuffles.size() == Indices.size() &&
9844 "Unmatched number of shufflevectors and indices");
9845
9846 const DataLayout &DL = LI->getModule()->getDataLayout();
9847
9848 VectorType *VTy = Shuffles[0]->getType();
9849
9850 // Skip if we do not have NEON and skip illegal vector types. We can
9851 // "legalize" wide vector types into multiple interleaved accesses as long as
9852 // the vector types are divisible by 128.
9853 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
9854 return false;
9855
9856 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
9857
9858 auto *FVTy = cast<FixedVectorType>(VTy);
9859
9860 // A pointer vector cannot be the return type of the ldN intrinsics. Need to
9861 // load integer vectors first and then convert to pointer vectors.
9862 Type *EltTy = FVTy->getElementType();
9863 if (EltTy->isPointerTy())
9864 FVTy =
9865 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
9866
9867 IRBuilder<> Builder(LI);
9868
9869 // The base address of the load.
9870 Value *BaseAddr = LI->getPointerOperand();
9871
9872 if (NumLoads > 1) {
9873 // If we're going to generate more than one load, reset the sub-vector type
9874 // to something legal.
9875 FVTy = FixedVectorType::get(FVTy->getElementType(),
9876 FVTy->getNumElements() / NumLoads);
9877
9878 // We will compute the pointer operand of each load from the original base
9879 // address using GEPs. Cast the base address to a pointer to the scalar
9880 // element type.
9881 BaseAddr = Builder.CreateBitCast(
9882 BaseAddr,
9883 FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
9884 }
9885
9886 Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
9887 Type *Tys[2] = {FVTy, PtrTy};
9888 static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
9889 Intrinsic::aarch64_neon_ld3,
9890 Intrinsic::aarch64_neon_ld4};
9891 Function *LdNFunc =
9892 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
9893
9894 // Holds sub-vectors extracted from the load intrinsic return values. The
9895 // sub-vectors are associated with the shufflevector instructions they will
9896 // replace.
9897 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
9898
9899 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
9900
9901 // If we're generating more than one load, compute the base address of
9902 // subsequent loads as an offset from the previous.
9903 if (LoadCount > 0)
9904 BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
9905 FVTy->getNumElements() * Factor);
9906
9907 CallInst *LdN = Builder.CreateCall(
9908 LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
9909
9910 // Extract and store the sub-vectors returned by the load intrinsic.
9911 for (unsigned i = 0; i < Shuffles.size(); i++) {
9912 ShuffleVectorInst *SVI = Shuffles[i];
9913 unsigned Index = Indices[i];
9914
9915 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
9916
9917 // Convert the integer vector to pointer vector if the element is pointer.
9918 if (EltTy->isPointerTy())
9919 SubVec = Builder.CreateIntToPtr(
9920 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
9921 FVTy->getNumElements()));
9922 SubVecs[SVI].push_back(SubVec);
9923 }
9924 }
9925
9926 // Replace uses of the shufflevector instructions with the sub-vectors
9927 // returned by the load intrinsic. If a shufflevector instruction is
9928 // associated with more than one sub-vector, those sub-vectors will be
9929 // concatenated into a single wide vector.
9930 for (ShuffleVectorInst *SVI : Shuffles) {
9931 auto &SubVec = SubVecs[SVI];
9932 auto *WideVec =
9933 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
9934 SVI->replaceAllUsesWith(WideVec);
9935 }
9936
9937 return true;
9938}
9939
9940/// Lower an interleaved store into a stN intrinsic.
9941///
9942/// E.g. Lower an interleaved store (Factor = 3):
9943/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
9944/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
9945/// store <12 x i32> %i.vec, <12 x i32>* %ptr
9946///
9947/// Into:
9948/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
9949/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
9950/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
9951/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
9952///
9953/// Note that the new shufflevectors will be removed and we'll only generate one
9954/// st3 instruction in CodeGen.
9955///
9956/// Example for a more general valid mask (Factor 3). Lower:
9957/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
9958/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
9959/// store <12 x i32> %i.vec, <12 x i32>* %ptr
9960///
9961/// Into:
9962/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
9963/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
9964/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
9965/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
9966 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
9967 ShuffleVectorInst *SVI,
9968 unsigned Factor) const {
9969 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
9970 "Invalid interleave factor");
9971
9972 auto *VecTy = cast<FixedVectorType>(SVI->getType());
9973 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
9974
9975 unsigned LaneLen = VecTy->getNumElements() / Factor;
9976 Type *EltTy = VecTy->getElementType();
9977 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
9978
9979 const DataLayout &DL = SI->getModule()->getDataLayout();
9980
9981 // Skip if we do not have NEON and skip illegal vector types. We can
9982 // "legalize" wide vector types into multiple interleaved accesses as long as
9983 // the vector types are divisible by 128.
9984 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
9985 return false;
9986
9987 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
9988
9989 Value *Op0 = SVI->getOperand(0);
9990 Value *Op1 = SVI->getOperand(1);
9991 IRBuilder<> Builder(SI);
9992
9993 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
9994 // vectors to integer vectors.
9995 if (EltTy->isPointerTy()) {
9996 Type *IntTy = DL.getIntPtrType(EltTy);
9997 unsigned NumOpElts =
9998 cast<FixedVectorType>(Op0->getType())->getNumElements();
9999
10000 // Convert to the corresponding integer vector.
10001 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
10002 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
10003 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
10004
10005 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
10006 }
10007
10008 // The base address of the store.
10009 Value *BaseAddr = SI->getPointerOperand();
10010
10011 if (NumStores > 1) {
10012 // If we're going to generate more than one store, reset the lane length
10013 // and sub-vector type to something legal.
10014 LaneLen /= NumStores;
10015 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
10016
10017 // We will compute the pointer operand of each store from the original base
10018 // address using GEPs. Cast the base address to a pointer to the scalar
10019 // element type.
10020 BaseAddr = Builder.CreateBitCast(
10021 BaseAddr,
10022 SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
10023 }
10024
10025 auto Mask = SVI->getShuffleMask();
10026
10027 Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
10028 Type *Tys[2] = {SubVecTy, PtrTy};
10029 static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
10030 Intrinsic::aarch64_neon_st3,
10031 Intrinsic::aarch64_neon_st4};
10032 Function *StNFunc =
10033 Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
10034
10035 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
10036
10037 SmallVector<Value *, 5> Ops;
10038
10039 // Split the shufflevector operands into sub vectors for the new stN call.
10040 for (unsigned i = 0; i < Factor; i++) {
10041 unsigned IdxI = StoreCount * LaneLen * Factor + i;
10042 if (Mask[IdxI] >= 0) {
10043 Ops.push_back(Builder.CreateShuffleVector(
10044 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
10045 } else {
10046 unsigned StartMask = 0;
10047 for (unsigned j = 1; j < LaneLen; j++) {
10048 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
10049 if (Mask[IdxJ * Factor + IdxI] >= 0) {
10050 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
10051 break;
10052 }
10053 }
10054 // Note: Filling undef gaps with random elements is ok, since
10055 // those elements were being written anyway (with undefs).
10056 // In the case of all undefs, we default to using elements from 0.
10057 // Note: StartMask cannot be negative; it is checked in
10058 // isReInterleaveMask.
10059 Ops.push_back(Builder.CreateShuffleVector(
10060 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
10061 }
10062 }
10063
10064 // If we're generating more than one store, compute the base address of
10065 // subsequent stores as an offset from the previous.
10066 if (StoreCount > 0)
10067 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
10068 BaseAddr, LaneLen * Factor);
10069
10070 Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
10071 Builder.CreateCall(StNFunc, Ops);
10072 }
10073 return true;
10074}
10075
10076// Lower an SVE structured load intrinsic returning a tuple type to target
10077// specific intrinsic taking the same input but returning a multi-result value
10078// of the split tuple type.
10079//
10080// E.g. Lowering an LD3:
10081//
10082// call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
10083// <vscale x 4 x i1> %pred,
10084// <vscale x 4 x i32>* %addr)
10085//
10086// Output DAG:
10087//
10088// t0: ch = EntryToken
10089// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
10090// t4: i64,ch = CopyFromReg t0, Register:i64 %1
10091// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
10092// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
10093//
10094// This is called pre-legalization to avoid widening/splitting issues with
10095// non-power-of-2 tuple types used for LD3, such as nxv12i32.
10096SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
10097 ArrayRef<SDValue> LoadOps,
10098 EVT VT, SelectionDAG &DAG,
10099 const SDLoc &DL) const {
10100 assert(VT.isScalableVector() && "Can only lower scalable vectors");
10101
10102 unsigned N, Opcode;
10103 static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
10104 {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
10105 {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
10106 {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
10107
10108 std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
10109 assert(VT.getVectorElementCount().Min % N == 0 &&
10110 "invalid tuple vector type!");
10111
10112 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
10113 VT.getVectorElementCount() / N);
10114 assert(isTypeLegal(SplitVT));
10115
10116 SmallVector<EVT, 5> VTs(N, SplitVT);
10117 VTs.push_back(MVT::Other); // Chain
10118 SDVTList NodeTys = DAG.getVTList(VTs);
10119
10120 SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
10121 SmallVector<SDValue, 4> PseudoLoadOps;
10122 for (unsigned I = 0; I < N; ++I)
10123 PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
10124 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
10125}
10126
10127 EVT AArch64TargetLowering::getOptimalMemOpType(
10128 const MemOp &Op, const AttributeList &FuncAttributes) const {
10129 bool CanImplicitFloat =
10130 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
10131 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
10132 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
10133 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
10134 // taken one instruction to materialize the v2i64 zero and one store (with
10135 // restrictive addressing mode). Just do i64 stores.
10136 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
10137 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
10138 if (Op.isAligned(AlignCheck))
10139 return true;
10140 bool Fast;
10141 return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
10142 &Fast) &&
10143 Fast;
10144 };
10145
10146 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
10147 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
10148 return MVT::v2i64;
10149 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
10150 return MVT::f128;
10151 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
10152 return MVT::i64;
10153 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
10154 return MVT::i32;
10155 return MVT::Other;
10156}
10157
10158 LLT AArch64TargetLowering::getOptimalMemOpLLT(
10159 const MemOp &Op, const AttributeList &FuncAttributes) const {
10160 bool CanImplicitFloat =
10161 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
10162 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
10163 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
10164 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
10165 // taken one instruction to materialize the v2i64 zero and one store (with
10166 // restrictive addressing mode). Just do i64 stores.
10167 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
10168 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
10169 if (Op.isAligned(AlignCheck))
10170 return true;
10171 bool Fast;
10172 return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
10173 &Fast) &&
10174 Fast;
10175 };
10176
10177 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
10178 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
10179 return LLT::vector(2, 64);
10180 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
10181 return LLT::scalar(128);
10182 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
10183 return LLT::scalar(64);
10184 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
10185 return LLT::scalar(32);
10186 return LLT();
10187}
10188
10189// 12-bit optionally shifted immediates are legal for adds.
10190 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
10191 if (Immed == std::numeric_limits<int64_t>::min()) {
10192 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
10193 << ": avoid UB for INT64_MIN\n");
10194 return false;
10195 }
10196 // Same encoding for add/sub, just flip the sign.
10197 Immed = std::abs(Immed);
10198 bool IsLegal = ((Immed >> 12) == 0 ||
10199 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
10200 LLVM_DEBUG(dbgs() << "Is " << Immed
10201 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
10202 return IsLegal;
10203}
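// For example, 4095 (0xfff) and 8192 (0x2 << 12) are legal add immediates,
// while 4097 (0x1001) is not: it would need both the unshifted and the
// shifted 12-bit fields at once.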
10204
10205// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
10206// immediates is the same as for an add or a sub.
10207 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
10208 return isLegalAddImmediate(Immed);
10209 }
10210
10211/// isLegalAddressingMode - Return true if the addressing mode represented
10212/// by AM is legal for this target, for a load/store of the specified type.
10213 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
10214 const AddrMode &AM, Type *Ty,
10215 unsigned AS, Instruction *I) const {
10216 // AArch64 has five basic addressing modes:
10217 // reg
10218 // reg + 9-bit signed offset
10219 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
10220 // reg1 + reg2
10221 // reg + SIZE_IN_BYTES * reg
10222
10223 // No global is ever allowed as a base.
10224 if (AM.BaseGV)
10225 return false;
10226
10227 // No reg+reg+imm addressing.
10228 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
10229 return false;
10230
10231 // FIXME: Update this method to support scalable addressing modes.
10233 return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
10234
10235 // check reg + imm case:
10236 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
10237 uint64_t NumBytes = 0;
10238 if (Ty->isSized()) {
10239 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
10240 NumBytes = NumBits / 8;
10241 if (!isPowerOf2_64(NumBits))
10242 NumBytes = 0;
10243 }
10244
10245 if (!AM.Scale) {
10246 int64_t Offset = AM.BaseOffs;
10247
10248 // 9-bit signed offset
10249 if (isInt<9>(Offset))
10250 return true;
10251
10252 // 12-bit unsigned offset
10253 unsigned shift = Log2_64(NumBytes);
10254 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
10255 // Must be a multiple of NumBytes (NumBytes is a power of 2)
10256 (Offset >> shift) << shift == Offset)
10257 return true;
10258 return false;
10259 }
10260
10261 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
10262
10263 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
10264}
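// For example, for an i64 access, [x0, #32760] (4095 * 8) and
// [x0, x1, lsl #3] are representable, while [x0, #32768] or a
// base + index + offset form is not and must be split.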
10265
10266 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
10267 // Consider splitting large offset of struct or array.
10268 return true;
10269}
10270
10271 int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
10272 const AddrMode &AM, Type *Ty,
10273 unsigned AS) const {
10274 // Scaling factors are not free at all.
10275 // Operands | Rt Latency
10276 // -------------------------------------------
10277 // Rt, [Xn, Xm] | 4
10278 // -------------------------------------------
10279 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
10280 // Rt, [Xn, Wm, <extend> #imm] |
10281 if (isLegalAddressingMode(DL, AM, Ty, AS))
10282 // Scale represents reg2 * scale, thus account for 1 if
10283 // it is not equal to 0 or 1.
10284 return AM.Scale != 0 && AM.Scale != 1;
10285 return -1;
10286}
10287
10288 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
10289 const MachineFunction &MF, EVT VT) const {
10290 VT = VT.getScalarType();
10291
10292 if (!VT.isSimple())
10293 return false;
10294
10295 switch (VT.getSimpleVT().SimpleTy) {
10296 case MVT::f32:
10297 case MVT::f64:
10298 return true;
10299 default:
10300 break;
10301 }
10302
10303 return false;
10304}
10305
10306 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
10307 Type *Ty) const {
10308 switch (Ty->getScalarType()->getTypeID()) {
10309 case Type::FloatTyID:
10310 case Type::DoubleTyID:
10311 return true;
10312 default:
10313 return false;
10314 }
10315}
10316
10317const MCPhysReg *
10318 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
10319 // LR is a callee-save register, but we must treat it as clobbered by any call
10320 // site. Hence we include LR in the scratch registers, which are in turn added
10321 // as implicit-defs for stackmaps and patchpoints.
10322 static const MCPhysReg ScratchRegs[] = {
10323 AArch64::X16, AArch64::X17, AArch64::LR, 0
10324 };
10325 return ScratchRegs;
10326}
10327
10328bool
10329 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
10330 CombineLevel Level) const {
10331 N = N->getOperand(0).getNode();
10332 EVT VT = N->getValueType(0);
10333 // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
10334 // it with shift to let it be lowered to UBFX.
10335 if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
10336 isa<ConstantSDNode>(N->getOperand(1))) {
10337 uint64_t TruncMask = N->getConstantOperandVal(1);
10338 if (isMask_64(TruncMask) &&
10339 N->getOperand(0).getOpcode() == ISD::SRL &&
10340 isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
10341 return false;
10342 }
10343 return true;
10344}
10345
10346 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
10347 Type *Ty) const {
10348 assert(Ty->isIntegerTy());
10349
10350 unsigned BitSize = Ty->getPrimitiveSizeInBits();
10351 if (BitSize == 0)
10352 return false;
10353
10354 int64_t Val = Imm.getSExtValue();
10355 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
10356 return true;
10357
10358 if ((int64_t)Val < 0)
10359 Val = ~Val;
10360 if (BitSize == 32)
10361 Val &= (1LL << 32) - 1;
10362
10363 unsigned LZ = countLeadingZeros((uint64_t)Val);
10364 unsigned Shift = (63 - LZ) / 16;
10365 // MOVZ is free so return true for one or fewer MOVK.
10366 return Shift < 3;
10367}
10368
10369 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
10370 unsigned Index) const {
10371 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
10372 return false;
10373
10374 return (Index == 0 || Index == ResVT.getVectorNumElements());
10375}
10376
10377/// Turn vector tests of the signbit in the form of:
10378/// xor (sra X, elt_size(X)-1), -1
10379/// into:
10380/// cmge X, X, #0
10381 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
10382 const AArch64Subtarget *Subtarget) {
10383 EVT VT = N->getValueType(0);
10384 if (!Subtarget->hasNEON() || !VT.isVector())
10385 return SDValue();
10386
10387 // There must be a shift right algebraic before the xor, and the xor must be a
10388 // 'not' operation.
10389 SDValue Shift = N->getOperand(0);
10390 SDValue Ones = N->getOperand(1);
10391 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
10392 !ISD::isBuildVectorAllOnes(Ones.getNode()))
10393 return SDValue();
10394
10395 // The shift should be smearing the sign bit across each vector element.
10396 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
10397 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
10398 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
10399 return SDValue();
10400
10401 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
10402}
10403
10404// Generate SUBS and CSEL for integer abs.
10405 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
10406 EVT VT = N->getValueType(0);
10407
10408 SDValue N0 = N->getOperand(0);
10409 SDValue N1 = N->getOperand(1);
10410 SDLoc DL(N);
10411
10412 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
10413 // and change it to SUB and CSEL.
10414 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
10415 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
10416 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
10417 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
10418 if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
10419 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
10420 N0.getOperand(0));
10421 // Generate SUBS & CSEL.
10422 SDValue Cmp =
10423 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
10424 N0.getOperand(0), DAG.getConstant(0, DL, VT));
10425 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
10426 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
10427 SDValue(Cmp.getNode(), 1));
10428 }
10429 return SDValue();
10430}
10431
10432 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
10433 TargetLowering::DAGCombinerInfo &DCI,
10434 const AArch64Subtarget *Subtarget) {
10435 if (DCI.isBeforeLegalizeOps())
10436 return SDValue();
10437
10438 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
10439 return Cmp;
10440
10441 return performIntegerAbsCombine(N, DAG);
10442}
10443
10444SDValue
10445AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
10446 SelectionDAG &DAG,
10447 SmallVectorImpl<SDNode *> &Created) const {
10448 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
10449 if (isIntDivCheap(N->getValueType(0), Attr))
10450 return SDValue(N,0); // Lower SDIV as SDIV
10451
10452 // fold (sdiv X, pow2)
10453 EVT VT = N->getValueType(0);
10454 if ((VT != MVT::i32 && VT != MVT::i64) ||
10455 !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
10456 return SDValue();
10457
10458 SDLoc DL(N);
10459 SDValue N0 = N->getOperand(0);
10460 unsigned Lg2 = Divisor.countTrailingZeros();
10461 SDValue Zero = DAG.getConstant(0, DL, VT);
10462 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
10463
10464 // Add (N0 < 0) ? Pow2 - 1 : 0;
10465 SDValue CCVal;
10466 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
10467 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
10468 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
10469
10470 Created.push_back(Cmp.getNode());
10471 Created.push_back(Add.getNode());
10472 Created.push_back(CSel.getNode());
10473
10474 // Divide by pow2.
10475 SDValue SRA =
10476 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
10477
10478 // If we're dividing by a positive value, we're done. Otherwise, we must
10479 // negate the result.
10480 if (Divisor.isNonNegative())
10481 return SRA;
10482
10483 Created.push_back(SRA.getNode());
10484 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
10485}
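// For example, x sdiv 4 lowers to roughly:
//   add  w8, w0, #3
//   cmp  w0, #0
//   csel w8, w8, w0, lt // use x + 3 only when x is negative
//   asr  w0, w8, #2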
10486
10487 static bool IsSVECntIntrinsic(SDValue S) {
10488 switch(getIntrinsicID(S.getNode())) {
10489 default:
10490 break;
10491 case Intrinsic::aarch64_sve_cntb:
10492 case Intrinsic::aarch64_sve_cnth:
10493 case Intrinsic::aarch64_sve_cntw:
10494 case Intrinsic::aarch64_sve_cntd:
10495 return true;
10496 }
10497 return false;
10498}
10499
10500 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
10501 TargetLowering::DAGCombinerInfo &DCI,
10502 const AArch64Subtarget *Subtarget) {
10503 if (DCI.isBeforeLegalizeOps())
10504 return SDValue();
10505
10506 // The below optimizations require a constant RHS.
10507 if (!isa<ConstantSDNode>(N->getOperand(1)))
10508 return SDValue();
10509
10510 SDValue N0 = N->getOperand(0);
10511 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
10512 const APInt &ConstValue = C->getAPIntValue();
10513
10514 // Allow the scaling to be folded into the `cnt` instruction by preventing
10515 // the scaling to be obscured here. This makes it easier to pattern match.
10516 if (IsSVECntIntrinsic(N0) ||
10517 (N0->getOpcode() == ISD::TRUNCATE &&
10518 (IsSVECntIntrinsic(N0->getOperand(0)))))
10519 if (ConstValue.sge(1) && ConstValue.sle(16))
10520 return SDValue();
10521
10522 // Multiplication by a power of two plus/minus one can be done more
10523 // cheaply as a shift+add/sub. For now, this is true unilaterally. If
10524 // future CPUs have a cheaper MADD instruction, this may need to be
10525 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
10526 // 64-bit is 5 cycles, so this is always a win.
10527 // More aggressively, some multiplications N0 * C can be lowered to
10528 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
10529 // e.g. 6=3*2=(2+1)*2.
10530 // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
10531 // which equals (1+2)*16-(1+2).
10532 // TrailingZeroes is used to test if the mul can be lowered to
10533 // shift+add+shift.
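// For example, C = 6 = (2 + 1) * 2 gives
// (mul x, 6) => (shl (add (shl x, 1), x), 1).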
10534 unsigned TrailingZeroes = ConstValue.countTrailingZeros();
10535 if (TrailingZeroes) {
10536 // Conservatively do not lower to shift+add+shift if the mul might be
10537 // folded into smul or umul.
10538 if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
10539 isZeroExtended(N0.getNode(), DAG)))
10540 return SDValue();
10541 // Conservatively do not lower to shift+add+shift if the mul might be
10542 // folded into madd or msub.
10543 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
10544 N->use_begin()->getOpcode() == ISD::SUB))
10545 return SDValue();
10546 }
10547 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
10548 // and shift+add+shift.
10549 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
10550
10551 unsigned ShiftAmt, AddSubOpc;
10552 // Is the shifted value the LHS operand of the add/sub?
10553 bool ShiftValUseIsN0 = true;
10554 // Do we need to negate the result?
10555 bool NegateResult = false;
10556
10557 if (ConstValue.isNonNegative()) {
10558 // (mul x, 2^N + 1) => (add (shl x, N), x)
10559 // (mul x, 2^N - 1) => (sub (shl x, N), x)
10560 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
10561 APInt SCVMinus1 = ShiftedConstValue - 1;
10562 APInt CVPlus1 = ConstValue + 1;
10563 if (SCVMinus1.isPowerOf2()) {
10564 ShiftAmt = SCVMinus1.logBase2();
10565 AddSubOpc = ISD::ADD;
10566 } else if (CVPlus1.isPowerOf2()) {
10567 ShiftAmt = CVPlus1.logBase2();
10568 AddSubOpc = ISD::SUB;
10569 } else
10570 return SDValue();
10571 } else {
10572 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
10573 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
10574 APInt CVNegPlus1 = -ConstValue + 1;
10575 APInt CVNegMinus1 = -ConstValue - 1;
10576 if (CVNegPlus1.isPowerOf2()) {
10577 ShiftAmt = CVNegPlus1.logBase2();
10578 AddSubOpc = ISD::SUB;
10579 ShiftValUseIsN0 = false;
10580 } else if (CVNegMinus1.isPowerOf2()) {
10581 ShiftAmt = CVNegMinus1.logBase2();
10582 AddSubOpc = ISD::ADD;
10583 NegateResult = true;
10584 } else
10585 return SDValue();
10586 }
10587
10588 SDLoc DL(N);
10589 EVT VT = N->getValueType(0);
10590 SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
10591 DAG.getConstant(ShiftAmt, DL, MVT::i64));
10592
10593 SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
10594 SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
10595 SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
10596 assert(!(NegateResult && TrailingZeroes) &&
10597 "NegateResult and TrailingZeroes cannot both be true for now.");
10598 // Negate the result.
10599 if (NegateResult)
10600 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
10601 // Shift the result.
10602 if (TrailingZeroes)
10603 return DAG.getNode(ISD::SHL, DL, VT, Res,
10604 DAG.getConstant(TrailingZeroes, DL, MVT::i64));
10605 return Res;
10606}
10607
10608 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
10609 SelectionDAG &DAG) {
10610 // Take advantage of vector comparisons producing 0 or -1 in each lane to
10611 // optimize away operation when it's from a constant.
10612 //
10613 // The general transformation is:
10614 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
10615 // AND(VECTOR_CMP(x,y), constant2)
10616 // constant2 = UNARYOP(constant)
10617
10618 // Early exit if this isn't a vector operation, the operand of the
10619 // unary operation isn't a bitwise AND, or if the sizes of the operations
10620 // aren't the same.
10621 EVT VT = N->getValueType(0);
10622 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
10623 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
10624 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
10625 return SDValue();
10626
10627 // Now check that the other operand of the AND is a constant. We could
10628 // make the transformation for non-constant splats as well, but it's unclear
10629 // that would be a benefit as it would not eliminate any operations, just
10630 // perform one more step in scalar code before moving to the vector unit.
10631 if (BuildVectorSDNode *BV =
10632 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
10633 // Bail out if the vector isn't a constant.
10634 if (!BV->isConstant())
10635 return SDValue();
10636
10637 // Everything checks out. Build up the new and improved node.
10638 SDLoc DL(N);
10639 EVT IntVT = BV->getValueType(0);
10640 // Create a new constant of the appropriate type for the transformed
10641 // DAG.
10642 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
10643 // The AND node needs bitcasts to/from an integer vector type around it.
10644 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
10645 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
10646 N->getOperand(0)->getOperand(0), MaskConst);
10647 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
10648 return Res;
10649 }
10650
10651 return SDValue();
10652}
10653
10654 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
10655 const AArch64Subtarget *Subtarget) {
10656 // First try to optimize away the conversion when it's conditionally from
10657 // a constant. Vectors only.
10658 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
10659 return Res;
10660
10661 EVT VT = N->getValueType(0);
10662 if (VT != MVT::f32 && VT != MVT::f64)
10663 return SDValue();
10664
10665 // Only optimize when the source and destination types have the same width.
10666 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
10667 return SDValue();
10668
10669 // If the result of an integer load is only used by an integer-to-float
10670 // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
10671 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
10672 SDValue N0 = N->getOperand(0);
10673 if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
10674 // Do not change the width of a volatile load.
10675 !cast<LoadSDNode>(N0)->isVolatile()) {
10676 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
10677 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
10678 LN0->getPointerInfo(), LN0->getAlignment(),
10679 LN0->getMemOperand()->getFlags());
10680
10681 // Make sure successors of the original load stay after it by updating them
10682 // to use the new Chain.
10683 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
10684
10685 unsigned Opcode =
10686 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
10687 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
10688 }
10689
10690 return SDValue();
10691}
10692
10693/// Fold a floating-point multiply by power of two into floating-point to
10694/// fixed-point conversion.
10695 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
10696 TargetLowering::DAGCombinerInfo &DCI,
10697 const AArch64Subtarget *Subtarget) {
10698 if (!Subtarget->hasNEON())
10699 return SDValue();
10700
10701 if (!N->getValueType(0).isSimple())
10702 return SDValue();
10703
10704 SDValue Op = N->getOperand(0);
10705 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
10706 Op.getOpcode() != ISD::FMUL)
10707 return SDValue();
10708
10709 SDValue ConstVec = Op->getOperand(1);
10710 if (!isa<BuildVectorSDNode>(ConstVec))
10711 return SDValue();
10712
10713 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
10714 uint32_t FloatBits = FloatTy.getSizeInBits();
10715 if (FloatBits != 32 && FloatBits != 64)
10716 return SDValue();
10717
10718 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
10719 uint32_t IntBits = IntTy.getSizeInBits();
10720 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
10721 return SDValue();
10722
10723 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
10724 if (IntBits > FloatBits)
10725 return SDValue();
10726
10727 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
10728 APInt UndefElements;
10729 int32_t Bits = IntBits == 64 ? 64 : 32;
10730 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
10731 if (C == -1 || C == 0 || C > Bits)
10732 return SDValue();
10733
10734 MVT ResTy;
10735 unsigned NumLanes = Op.getValueType().getVectorNumElements();
10736 switch (NumLanes) {
10737 default:
10738 return SDValue();
10739 case 2:
10740 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
10741 break;
10742 case 4:
10743 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
10744 break;
10745 }
10746
10747 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
10748 return SDValue();
10749
10750 assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
10751 "Illegal vector type after legalization");
10752
10753 SDLoc DL(N);
10754 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
10755 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
10756 : Intrinsic::aarch64_neon_vcvtfp2fxu;
10757 SDValue FixConv =
10758 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
10759 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
10760 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
10761 // We can handle smaller integers by generating an extra trunc.
10762 if (IntBits < FloatBits)
10763 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
10764
10765 return FixConv;
10766}
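// For example, (fptosi (fmul v4f32 x, splat 16.0)) becomes a single
// fixed-point convert with 4 fractional bits, roughly
//   fcvtzs v0.4s, v0.4s, #4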
10767
10768/// Fold a floating-point divide by power of two into fixed-point to
10769/// floating-point conversion.
10770 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
10771 TargetLowering::DAGCombinerInfo &DCI,
10772 const AArch64Subtarget *Subtarget) {
10773 if (!Subtarget->hasNEON())
10774 return SDValue();
10775
10776 SDValue Op = N->getOperand(0);
10777 unsigned Opc = Op->getOpcode();
10778 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
10779 !Op.getOperand(0).getValueType().isSimple() ||
10780 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
10781 return SDValue();
10782
10783 SDValue ConstVec = N->getOperand(1);
10784 if (!isa<BuildVectorSDNode>(ConstVec))
10785 return SDValue();
10786
10787 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
10788 int32_t IntBits = IntTy.getSizeInBits();
10789 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
10790 return SDValue();
10791
10792 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
10793 int32_t FloatBits = FloatTy.getSizeInBits();
10794 if (FloatBits != 32 && FloatBits != 64)
10795 return SDValue();
10796
10797 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
10798 if (IntBits > FloatBits)
10799 return SDValue();
10800
10801 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
10802 APInt UndefElements;
10803 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
10804 if (C == -1 || C == 0 || C > FloatBits)
10805 return SDValue();
10806
10807 MVT ResTy;
10808 unsigned NumLanes = Op.getValueType().getVectorNumElements();
10809 switch (NumLanes) {
10810 default:
10811 return SDValue();
10812 case 2:
10813 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
10814 break;
10815 case 4:
10816 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
10817 break;
10818 }
10819
10820 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
10821 return SDValue();
10822
10823 SDLoc DL(N);
10824 SDValue ConvInput = Op.getOperand(0);
10825 bool IsSigned = Opc == ISD::SINT_TO_FP;
10826 if (IntBits < FloatBits)
10827 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
10828 ResTy, ConvInput);
10829
10830 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
10831 : Intrinsic::aarch64_neon_vcvtfxu2fp;
10832 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
10833 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
10834 DAG.getConstant(C, DL, MVT::i32));
10835}
10836
10837/// An EXTR instruction is made up of two shifts, ORed together. This helper
10838/// searches for and classifies those shifts.
10839static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
10840 bool &FromHi) {
10841 if (N.getOpcode() == ISD::SHL)
10842 FromHi = false;
10843 else if (N.getOpcode() == ISD::SRL)
10844 FromHi = true;
10845 else
10846 return false;
10847
10848 if (!isa<ConstantSDNode>(N.getOperand(1)))
10849 return false;
10850
10851 ShiftAmount = N->getConstantOperandVal(1);
10852 Src = N->getOperand(0);
10853 return true;
10854}
10855
10856/// EXTR instruction extracts a contiguous chunk of bits from two existing
10857/// registers viewed as a high/low pair. This function looks for the pattern:
10858/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
10859/// with an EXTR. Can't quite be done in TableGen because the two immediates
10860/// aren't independent.
10861 static SDValue tryCombineToEXTR(SDNode *N,
10862 TargetLowering::DAGCombinerInfo &DCI) {
10863 SelectionDAG &DAG = DCI.DAG;
10864 SDLoc DL(N);
10865 EVT VT = N->getValueType(0);
10866
10867 assert(N->getOpcode() == ISD::OR && "Unexpected root");
10868
10869 if (VT != MVT::i32 && VT != MVT::i64)
10870 return SDValue();
10871
10872 SDValue LHS;
10873 uint32_t ShiftLHS = 0;
10874 bool LHSFromHi = false;
10875 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
10876 return SDValue();
10877
10878 SDValue RHS;
10879 uint32_t ShiftRHS = 0;
10880 bool RHSFromHi = false;
10881 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
10882 return SDValue();
10883
10884 // If they're both trying to come from the high part of the register, they're
10885 // not really an EXTR.
10886 if (LHSFromHi == RHSFromHi)
10887 return SDValue();
10888
10889 if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
10890 return SDValue();
10891
10892 if (LHSFromHi) {
10893 std::swap(LHS, RHS);
10894 std::swap(ShiftLHS, ShiftRHS);
10895 }
10896
10897 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
10898 DAG.getConstant(ShiftRHS, DL, MVT::i64));
10899}
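// For example, on i64, (or (shl x, 48), (srl y, 16)) becomes
// EXTR x, y, #16, i.e. bits [79:16] of the 128-bit x:y register pair.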
10900
10901 static SDValue tryCombineToBSL(SDNode *N,
10902 TargetLowering::DAGCombinerInfo &DCI) {
10903 EVT VT = N->getValueType(0);
10904 SelectionDAG &DAG = DCI.DAG;
10905 SDLoc DL(N);
10906
10907 if (!VT.isVector())
10908 return SDValue();
10909
10910 SDValue N0 = N->getOperand(0);
10911 if (N0.getOpcode() != ISD::AND)
10912 return SDValue();
10913
10914 SDValue N1 = N->getOperand(1);
10915 if (N1.getOpcode() != ISD::AND)
10916 return SDValue();
10917
10918 // We only have to look for constant vectors here since the general, variable
10919 // case can be handled in TableGen.
10920 unsigned Bits = VT.getScalarSizeInBits();
10921 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
10922 for (int i = 1; i >= 0; --i)
10923 for (int j = 1; j >= 0; --j) {
10924 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
10925 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
10926 if (!BVN0 || !BVN1)
10927 continue;
10928
10929 bool FoundMatch = true;
10930 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
10931 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
10932 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
10933 if (!CN0 || !CN1 ||
10934 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
10935 FoundMatch = false;
10936 break;
10937 }
10938 }
10939
10940 if (FoundMatch)
10941 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
10942 N0->getOperand(1 - i), N1->getOperand(1 - j));
10943 }
10944
10945 return SDValue();
10946}
10947
10948 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
10949 const AArch64Subtarget *Subtarget) {
10950 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
10951 SelectionDAG &DAG = DCI.DAG;
10952 EVT VT = N->getValueType(0);
10953
10954 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
10955 return SDValue();
10956
10957 if (SDValue Res = tryCombineToEXTR(N, DCI))
10958 return Res;
10959
10960 if (SDValue Res = tryCombineToBSL(N, DCI))
10961 return Res;
10962
10963 return SDValue();
10964}
10965
10966 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
10967 if (!MemVT.getVectorElementType().isSimple())
10968 return false;
10969
10970 uint64_t MaskForTy = 0ull;
10971 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
10972 case MVT::i8:
10973 MaskForTy = 0xffull;
10974 break;
10975 case MVT::i16:
10976 MaskForTy = 0xffffull;
10977 break;
10978 case MVT::i32:
10979 MaskForTy = 0xffffffffull;
10980 break;
10981 default:
10982 return false;
10983 break;
10984 }
10985
10986 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
10987 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
10988 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
10989
10990 return false;
10991}
10992
10993 static SDValue performSVEAndCombine(SDNode *N,
10994 TargetLowering::DAGCombinerInfo &DCI) {
10995 if (DCI.isBeforeLegalizeOps())
10996 return SDValue();
10997
10998 SelectionDAG &DAG = DCI.DAG;
10999 SDValue Src = N->getOperand(0);
11000 unsigned Opc = Src->getOpcode();
11001
11002 // Zero/any extend of an unsigned unpack
11003 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
11004 SDValue UnpkOp = Src->getOperand(0);
11005 SDValue Dup = N->getOperand(1);
11006
11007 if (Dup.getOpcode() != AArch64ISD::DUP)
11008 return SDValue();
11009
11010 SDLoc DL(N);
11011 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
11012 uint64_t ExtVal = C->getZExtValue();
11013
11014 // If the mask is fully covered by the unpack, we don't need to push
11015 // a new AND onto the operand
11016 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
11017 if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
11018 (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
11019 (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
11020 return Src;
11021
11022 // Truncate to prevent a DUP with an over-wide constant
11023 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
11024
11025 // Otherwise, make sure we propagate the AND to the operand
11026 // of the unpack
11027 Dup = DAG.getNode(AArch64ISD::DUP, DL,
11028 UnpkOp->getValueType(0),
11029 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
11030
11031 SDValue And = DAG.getNode(ISD::AND, DL,
11032 UnpkOp->getValueType(0), UnpkOp, Dup);
11033
11034 return DAG.getNode(Opc, DL, N->getValueType(0), And);
11035 }
11036
11037 SDValue Mask = N->getOperand(1);
11038
11039 if (!Src.hasOneUse())
11040 return SDValue();
11041
11042 EVT MemVT;
11043
11044 // SVE load instructions perform an implicit zero-extend, which makes them
11045 // perfect candidates for combining.
11046 switch (Opc) {
11047 case AArch64ISD::LD1_MERGE_ZERO:
11048 case AArch64ISD::LDNF1_MERGE_ZERO:
11049 case AArch64ISD::LDFF1_MERGE_ZERO:
11050 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
11051 break;
11052 case AArch64ISD::GLD1_MERGE_ZERO:
11053 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
11054 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
11055 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
11056 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
11057 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
11058 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
11059 case AArch64ISD::GLDFF1_MERGE_ZERO:
11060 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
11061 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
11062 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
11063 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
11064 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
11065 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
11066 case AArch64ISD::GLDNT1_MERGE_ZERO:
11067 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
11068 break;
11069 default:
11070 return SDValue();
11071 }
11072
11073 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
11074 return Src;
11075
11076 return SDValue();
11077}
11078
11079 static SDValue performANDCombine(SDNode *N,
11080 TargetLowering::DAGCombinerInfo &DCI) {
11081 SelectionDAG &DAG = DCI.DAG;
11082 SDValue LHS = N->getOperand(0);
11083 EVT VT = N->getValueType(0);
11084 if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
11085 return SDValue();
11086
11087 if (VT.isScalableVector())
11088 return performSVEAndCombine(N, DCI);
11089
11090 BuildVectorSDNode *BVN =
11091 dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
11092 if (!BVN)
11093 return SDValue();
11094
11095 // AND does not accept an immediate, so check if we can use a BIC immediate
11096 // instruction instead. We do this here instead of using a (and x, (mvni imm))
11097 // pattern in isel, because some immediates may be lowered to the preferred
11098 // (and x, (movi imm)) form, even though an mvni representation also exists.
11099 APInt DefBits(VT.getSizeInBits(), 0);
11100 APInt UndefBits(VT.getSizeInBits(), 0);
11101 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
11102 SDValue NewOp;
11103
11104 DefBits = ~DefBits;
11105 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
11106 DefBits, &LHS)) ||
11107 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
11108 DefBits, &LHS)))
11109 return NewOp;
11110
11111 UndefBits = ~UndefBits;
11112 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
11113 UndefBits, &LHS)) ||
11114 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
11115 UndefBits, &LHS)))
11116 return NewOp;
11117 }
11118
11119 return SDValue();
11120}
11121
11122 static SDValue performSRLCombine(SDNode *N,
11123 TargetLowering::DAGCombinerInfo &DCI) {
11124 SelectionDAG &DAG = DCI.DAG;
11125 EVT VT = N->getValueType(0);
11126 if (VT != MVT::i32 && VT != MVT::i64)
11127 return SDValue();
11128
11129 // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
11130 // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
11131 // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
11132 SDValue N0 = N->getOperand(0);
11133 if (N0.getOpcode() == ISD::BSWAP) {
11134 SDLoc DL(N);
11135 SDValue N1 = N->getOperand(1);
11136 SDValue N00 = N0.getOperand(0);
11137 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
11138 uint64_t ShiftAmt = C->getZExtValue();
11139 if (VT == MVT::i32 && ShiftAmt == 16 &&
11140 DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
11141 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
11142 if (VT == MVT::i64 && ShiftAmt == 32 &&
11143 DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
11144 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
11145 }
11146 }
11147 return SDValue();
11148}
11149
11150 static SDValue performConcatVectorsCombine(SDNode *N,
11151 TargetLowering::DAGCombinerInfo &DCI,
11152 SelectionDAG &DAG) {
11153 SDLoc dl(N);
11154 EVT VT = N->getValueType(0);
11155 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
11156 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
11157
11158 // Optimize concat_vectors of truncated vectors, where the intermediate
11159 // type is illegal, to avoid said illegality, e.g.,
11160 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
11161 // (v2i16 (truncate (v2i64)))))
11162 // ->
11163 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
11164 // (v4i32 (bitcast (v2i64))),
11165 // <0, 2, 4, 6>)))
11166 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
11167 // on both input and result type, so we might generate worse code.
11168 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
11169 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
11170 N1Opc == ISD::TRUNCATE) {
11171 SDValue N00 = N0->getOperand(0);
11172 SDValue N10 = N1->getOperand(0);
11173 EVT N00VT = N00.getValueType();
11174
11175 if (N00VT == N10.getValueType() &&
11176 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
11177 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
11178 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
11179 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
11180 for (size_t i = 0; i < Mask.size(); ++i)
11181 Mask[i] = i * 2;
11182 return DAG.getNode(ISD::TRUNCATE, dl, VT,
11183 DAG.getVectorShuffle(
11184 MidVT, dl,
11185 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
11186 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
11187 }
11188 }
11189
11190 // Wait 'til after everything is legalized to try this. That way we have
11191 // legal vector types and such.
11192 if (DCI.isBeforeLegalizeOps())
11193 return SDValue();
11194
11195 // Optimise concat_vectors of two [us]rhadds that use extracted subvectors
11196 // from the same original vectors. Combine these into a single [us]rhadd that
11197 // operates on the two original vectors. Example:
11198 // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
11199 // extract_subvector (v16i8 OpB,
11200 // <0>))),
11201 // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
11202 // extract_subvector (v16i8 OpB,
11203 // <8>)))))
11204 // ->
11205 // (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
11206 if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
11207 (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) {
11208 SDValue N00 = N0->getOperand(0);
11209 SDValue N01 = N0->getOperand(1);
11210 SDValue N10 = N1->getOperand(0);
11211 SDValue N11 = N1->getOperand(1);
11212
11213 EVT N00VT = N00.getValueType();
11214 EVT N10VT = N10.getValueType();
11215
11216 if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
11217 N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
11218 N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
11219 N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
11220 SDValue N00Source = N00->getOperand(0);
11221 SDValue N01Source = N01->getOperand(0);
11222 SDValue N10Source = N10->getOperand(0);
11223 SDValue N11Source = N11->getOperand(0);
11224
11225 if (N00Source == N10Source && N01Source == N11Source &&
11226 N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
11227 assert(N0.getValueType() == N1.getValueType());
11228
11229 uint64_t N00Index = N00.getConstantOperandVal(1);
11230 uint64_t N01Index = N01.getConstantOperandVal(1);
11231 uint64_t N10Index = N10.getConstantOperandVal(1);
11232 uint64_t N11Index = N11.getConstantOperandVal(1);
11233
11234 if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
11235 N10Index == N00VT.getVectorNumElements())
11236 return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
11237 }
11238 }
11239 }
11240
11241 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
11242 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
11243 // canonicalise to that.
11244 if (N0 == N1 && VT.getVectorNumElements() == 2) {
11245 assert(VT.getScalarSizeInBits() == 64);
11246 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
11247 DAG.getConstant(0, dl, MVT::i64));
11248 }
11249
11250 // Canonicalise concat_vectors so that the right-hand vector has as few
11251 // bit-casts as possible before its real operation. The primary matching
11252 // destination for these operations will be the narrowing "2" instructions,
11253 // which depend on the operation being performed on this right-hand vector.
11254 // For example,
11255 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
11256 // becomes
11257 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
11258
11259 if (N1Opc != ISD::BITCAST)
11260 return SDValue();
11261 SDValue RHS = N1->getOperand(0);
11262 MVT RHSTy = RHS.getValueType().getSimpleVT();
11263 // If the RHS is not a vector, this is not the pattern we're looking for.
11264 if (!RHSTy.isVector())
11265 return SDValue();
11266
11267 LLVM_DEBUG(
11268 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
11269
11270 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
11271 RHSTy.getVectorNumElements() * 2);
11272 return DAG.getNode(ISD::BITCAST, dl, VT,
11273 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
11274 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
11275 RHS));
11276}
11277
11278static SDValue tryCombineFixedPointConvert(SDNode *N,
11279 TargetLowering::DAGCombinerInfo &DCI,
11280 SelectionDAG &DAG) {
11281 // Wait until after everything is legalized to try this. That way we have
11282 // legal vector types and such.
11283 if (DCI.isBeforeLegalizeOps())
11284 return SDValue();
11285 // Transform a scalar conversion of a value from a lane extract into a
11286 // lane extract of a vector conversion. E.g., from foo1 to foo2:
11287 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
11288 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
11289 //
11290 // The second form interacts better with instruction selection and the
11291 // register allocator to avoid cross-class register copies that aren't
11292 // coalescable due to a lane reference.
11293
11294 // Check the operand and see if it originates from a lane extract.
11295 SDValue Op1 = N->getOperand(1);
11296 if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
11297 // Yep, no additional predication needed. Perform the transform.
11298 SDValue IID = N->getOperand(0);
11299 SDValue Shift = N->getOperand(2);
11300 SDValue Vec = Op1.getOperand(0);
11301 SDValue Lane = Op1.getOperand(1);
11302 EVT ResTy = N->getValueType(0);
11303 EVT VecResTy;
11304 SDLoc DL(N);
11305
11306 // The vector width should be 128 bits by the time we get here, even
11307 // if it started as 64 bits (the extract_vector handling will have
11308 // done so).
11309 assert(Vec.getValueSizeInBits() == 128 &&
11310 "unexpected vector size on extract_vector_elt!");
11311 if (Vec.getValueType() == MVT::v4i32)
11312 VecResTy = MVT::v4f32;
11313 else if (Vec.getValueType() == MVT::v2i64)
11314 VecResTy = MVT::v2f64;
11315 else
11316 llvm_unreachable("unexpected vector type!");
11317
11318 SDValue Convert =
11319 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
11320 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
11321 }
11322 return SDValue();
11323}
11324
11325// AArch64 high-vector "long" operations are formed by performing the non-high
11326// version on an extract_subvector of each operand which gets the high half:
11327//
11328// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
11329//
11330// However, there are cases which don't have an extract_high explicitly, but
11331// have another operation that can be made compatible with one for free. For
11332// example:
11333//
11334// (dupv64 scalar) --> (extract_high (dup128 scalar))
11335//
11336// This routine does the actual conversion of such DUPs, once outer routines
11337// have determined that everything else is in order.
11338// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
11339// similarly here.
11340static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
11341 switch (N.getOpcode()) {
11342 case AArch64ISD::DUP:
11343 case AArch64ISD::DUPLANE8:
11344 case AArch64ISD::DUPLANE16:
11345 case AArch64ISD::DUPLANE32:
11346 case AArch64ISD::DUPLANE64:
11347 case AArch64ISD::MOVI:
11348 case AArch64ISD::MOVIshift:
11349 case AArch64ISD::MOVIedit:
11350 case AArch64ISD::MOVImsl:
11351 case AArch64ISD::MVNIshift:
11352 case AArch64ISD::MVNImsl:
11353 break;
11354 default:
11355 // FMOV could be supported, but isn't very useful, as it would only occur
11356 // if you passed a bitcast'd floating point immediate to an eligible long
11357 // integer op (addl, smull, ...).
11358 return SDValue();
11359 }
11360
11361 MVT NarrowTy = N.getSimpleValueType();
11362 if (!NarrowTy.is64BitVector())
11363 return SDValue();
11364
11365 MVT ElementTy = NarrowTy.getVectorElementType();
11366 unsigned NumElems = NarrowTy.getVectorNumElements();
11367 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
11368
11369 SDLoc dl(N);
11370 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
11371 DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
11372 DAG.getConstant(NumElems, dl, MVT::i64));
11373}
11374
11375static bool isEssentiallyExtractHighSubvector(SDValue N) {
11376 if (N.getOpcode() == ISD::BITCAST)
11377 N = N.getOperand(0);
11378 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
11379 return false;
11380 return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
11381 N.getOperand(0).getValueType().getVectorNumElements() / 2;
11382}
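// Illustrative example (editorial, not from the original source): for a
// v16i8 vector X, (extract_subvector X, (i64 8)) selects lanes 8..15, i.e.
// the high 64-bit half that the "2" forms of the long instructions (saddl2,
// umull2, ...) consume, which is why the helper above treats it as an
// extract_high.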
11383
11384/// Helper structure to keep track of ISD::SET_CC operands.
11385struct GenericSetCCInfo {
11386 const SDValue *Opnd0;
11387 const SDValue *Opnd1;
11388 ISD::CondCode CC;
11389};
11390
11391/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
11392struct AArch64SetCCInfo {
11393 const SDValue *Cmp;
11394 AArch64CC::CondCode CC;
11395};
11396
11397/// Helper structure to keep track of SetCC information.
11398union SetCCInfo {
11399 GenericSetCCInfo Generic;
11400 AArch64SetCCInfo AArch64;
11401};
11402
11403/// Helper structure to be able to read SetCC information. If the IsAArch64
11404/// field is set to true, Info is an AArch64SetCCInfo, otherwise Info is a
11405/// GenericSetCCInfo.
11406struct SetCCInfoAndKind {
11407 SetCCInfo Info;
11408 bool IsAArch64;
11409};
11410
11411/// Check whether or not \p Op is a SET_CC operation, either a generic or an
11412/// AArch64 lowered one.
11413/// \p SetCCInfo is filled accordingly.
11414/// \post SetCCInfo is meaningful only when this function returns true.
11415/// \return True when Op is a kind of SET_CC operation.
11417static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
11418 // If this is a setcc, this is straightforward.
11419 if (Op.getOpcode() == ISD::SETCC) {
11420 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
11421 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
11422 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
11423 SetCCInfo.IsAArch64 = false;
11424 return true;
11425 }
11426 // Otherwise, check if this is a matching csel instruction.
11427 // In other words:
11428 // - csel 1, 0, cc
11429 // - csel 0, 1, !cc
11430 if (Op.getOpcode() != AArch64ISD::CSEL)
11431 return false;
11432 // Set the information about the operands.
11433 // TODO: we want the operands of the Cmp not the csel
11434 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
11435 SetCCInfo.IsAArch64 = true;
11436 SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
11437 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
11438
11439 // Check that the operands match the constraints:
11440 // (1) Both operands must be constants.
11441 // (2) One must be 1 and the other must be 0.
11442 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
11443 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
11444
11445 // Check (1).
11446 if (!TValue || !FValue)
11447 return false;
11448
11449 // Check (2).
11450 if (!TValue->isOne()) {
11451 // Update the comparison when we are interested in !cc.
11452 std::swap(TValue, FValue);
11453 SetCCInfo.Info.AArch64.CC =
11454 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
11455 }
11456 return TValue->isOne() && FValue->isNullValue();
11457}
11458
11459// Returns true if Op is setcc or zext of setcc.
11460static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
11461 if (isSetCC(Op, Info))
11462 return true;
11463 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
11464 isSetCC(Op->getOperand(0), Info));
11465}
11466
11467// The folding we want to perform is:
11468// (add x, [zext] (setcc cc ...) )
11469// -->
11470// (csel x, (add x, 1), !cc ...)
11471//
11472// The latter will get matched to a CSINC instruction.
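// Illustrative C-level example (editorial; register choices are schematic):
//   int f(int x, int a, int b) { return x + (a == b); }
// can be selected after this fold as
//   cmp   w1, w2
//   csinc w0, w0, w0, ne   // x + 1 when a == b, x otherwise
// rather than materialising the setcc result in a register and adding it.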
11473static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
11474 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
11475 SDValue LHS = Op->getOperand(0);
11476 SDValue RHS = Op->getOperand(1);
11477 SetCCInfoAndKind InfoAndKind;
11478
11479 // If neither operand is a SET_CC, give up.
11480 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
11481 std::swap(LHS, RHS);
11482 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
11483 return SDValue();
11484 }
11485
11486 // FIXME: This could be generalized to work for FP comparisons.
11487 EVT CmpVT = InfoAndKind.IsAArch64
11488 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
11489 : InfoAndKind.Info.Generic.Opnd0->getValueType();
11490 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
11491 return SDValue();
11492
11493 SDValue CCVal;
11494 SDValue Cmp;
11495 SDLoc dl(Op);
11496 if (InfoAndKind.IsAArch64) {
11497 CCVal = DAG.getConstant(
11498 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
11499 MVT::i32);
11500 Cmp = *InfoAndKind.Info.AArch64.Cmp;
11501 } else
11502 Cmp = getAArch64Cmp(
11503 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
11504 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
11505 dl);
11506
11507 EVT VT = Op->getValueType(0);
11508 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
11509 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
11510}
11511
11512// The basic add/sub long vector instructions have variants with "2" on the end
11513// which act on the high-half of their inputs. They are normally matched by
11514// patterns like:
11515//
11516// (add (zeroext (extract_high LHS)),
11517// (zeroext (extract_high RHS)))
11518// -> uaddl2 vD, vN, vM
11519//
11520// However, if one of the extracts is something like a duplicate, this
11521// instruction can still be used profitably. This function puts the DAG into a
11522// more appropriate form for those patterns to trigger.
11523static SDValue performAddSubLongCombine(SDNode *N,
11524 TargetLowering::DAGCombinerInfo &DCI,
11525 SelectionDAG &DAG) {
11526 if (DCI.isBeforeLegalizeOps())
11527 return SDValue();
11528
11529 MVT VT = N->getSimpleValueType(0);
11530 if (!VT.is128BitVector()) {
11531 if (N->getOpcode() == ISD::ADD)
11532 return performSetccAddFolding(N, DAG);
11533 return SDValue();
11534 }
11535
11536 // Make sure both branches are extended in the same way.
11537 SDValue LHS = N->getOperand(0);
11538 SDValue RHS = N->getOperand(1);
11539 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
11540 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
11541 LHS.getOpcode() != RHS.getOpcode())
11542 return SDValue();
11543
11544 unsigned ExtType = LHS.getOpcode();
11545
11546 // It's not worth doing if at least one of the inputs isn't already an
11547 // extract, but we don't know which it'll be so we have to try both.
11548 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
11549 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
11550 if (!RHS.getNode())
11551 return SDValue();
11552
11553 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
11554 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
11555 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
11556 if (!LHS.getNode())
11557 return SDValue();
11558
11559 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
11560 }
11561
11562 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
11563}
11564
11565// Massage DAGs which we can use the high-half "long" operations on into
11566// something isel will recognize better. E.g.
11567//
11568// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
11569// (aarch64_neon_umull (extract_high (v2i64 vec)))
11570// (extract_high (v2i64 (dup128 scalar)))))
11571//
11572static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
11573 TargetLowering::DAGCombinerInfo &DCI,
11574 SelectionDAG &DAG) {
11575 if (DCI.isBeforeLegalizeOps())
11576 return SDValue();
11577
11578 SDValue LHS = N->getOperand(1);
11579 SDValue RHS = N->getOperand(2);
11580 assert(LHS.getValueType().is64BitVector() &&
11581 RHS.getValueType().is64BitVector() &&
11582 "unexpected shape for long operation");
11583
11584 // Either node could be a DUP, but it's not worth doing both of them (you'd
11585 // just as well use the non-high version) so look for a corresponding extract
11586 // operation on the other "wing".
11587 if (isEssentiallyExtractHighSubvector(LHS)) {
11588 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
11589 if (!RHS.getNode())
11590 return SDValue();
11591 } else if (isEssentiallyExtractHighSubvector(RHS)) {
11592 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
11593 if (!LHS.getNode())
11594 return SDValue();
11595 }
11596
11597 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
11598 N->getOperand(0), LHS, RHS);
11599}
11600
11601static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
11602 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
11603 unsigned ElemBits = ElemTy.getSizeInBits();
11604
11605 int64_t ShiftAmount;
11606 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
11607 APInt SplatValue, SplatUndef;
11608 unsigned SplatBitSize;
11609 bool HasAnyUndefs;
11610 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
11611 HasAnyUndefs, ElemBits) ||
11612 SplatBitSize != ElemBits)
11613 return SDValue();
11614
11615 ShiftAmount = SplatValue.getSExtValue();
11616 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
11617 ShiftAmount = CVN->getSExtValue();
11618 } else
11619 return SDValue();
11620
11621 unsigned Opcode;
11622 bool IsRightShift;
11623 switch (IID) {
11624 default:
11625 llvm_unreachable("Unknown shift intrinsic");
11626 case Intrinsic::aarch64_neon_sqshl:
11627 Opcode = AArch64ISD::SQSHL_I;
11628 IsRightShift = false;
11629 break;
11630 case Intrinsic::aarch64_neon_uqshl:
11631 Opcode = AArch64ISD::UQSHL_I;
11632 IsRightShift = false;
11633 break;
11634 case Intrinsic::aarch64_neon_srshl:
11635 Opcode = AArch64ISD::SRSHR_I;
11636 IsRightShift = true;
11637 break;
11638 case Intrinsic::aarch64_neon_urshl:
11639 Opcode = AArch64ISD::URSHR_I;
11640 IsRightShift = true;
11641 break;
11642 case Intrinsic::aarch64_neon_sqshlu:
11643 Opcode = AArch64ISD::SQSHLU_I;
11644 IsRightShift = false;
11645 break;
11646 case Intrinsic::aarch64_neon_sshl:
11647 case Intrinsic::aarch64_neon_ushl:
11648 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
11649 // left shift for positive shift amounts. Below, we only replace the current
11650 // node with VSHL if this condition is met.
11651 Opcode = AArch64ISD::VSHL;
11652 IsRightShift = false;
11653 break;
11654 }
11655
11656 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
11657 SDLoc dl(N);
11658 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
11659 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
11660 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
11661 SDLoc dl(N);
11662 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
11663 DAG.getConstant(ShiftAmount, dl, MVT::i32));
11664 }
11665
11666 return SDValue();
11667}
11668
11669// The CRC32[BH] instructions ignore the high bits of their data operand. Since
11670// the intrinsics must be legal and take an i32, this means there's almost
11671// certainly going to be a zext in the DAG which we can eliminate.
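// Illustrative example (editorial, not from the original source): in IR such
// as
//   %b = and i32 %x, 255
//   %r = call i32 @llvm.aarch64.crc32b(i32 %c, i32 %b)
// the 'and' can be dropped, since CRC32B only reads the low 8 bits of its
// data operand anyway.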
11672static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
11673 SDValue AndN = N->getOperand(2);
11674 if (AndN.getOpcode() != ISD::AND)
11675 return SDValue();
11676
11677 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
11678 if (!CMask || CMask->getZExtValue() != Mask)
11679 return SDValue();
11680
11681 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
11682 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
11683}
11684
11685static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
11686 SelectionDAG &DAG) {
11687 SDLoc dl(N);
11688 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
11689 DAG.getNode(Opc, dl,
11690 N->getOperand(1).getSimpleValueType(),
11691 N->getOperand(1)),
11692 DAG.getConstant(0, dl, MVT::i64));
11693}
11694
11695static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc,
11696 SelectionDAG &DAG) {
11697 SDLoc dl(N);
11698 LLVMContext &Ctx = *DAG.getContext();
11699 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11700
11701 EVT VT = N->getValueType(0);
11702 SDValue Pred = N->getOperand(1);
11703 SDValue Data = N->getOperand(2);
11704 EVT DataVT = Data.getValueType();
11705
11706 if (DataVT.getVectorElementType().isScalarInteger() &&
11707 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64)) {
11708 if (!TLI.isTypeLegal(DataVT))
11709 return SDValue();
11710
11711 EVT OutputVT = EVT::getVectorVT(Ctx, VT,
11712 AArch64::SVEBitsPerBlock / VT.getSizeInBits());
11713 SDValue Reduce = DAG.getNode(Opc, dl, OutputVT, Pred, Data);
11714 SDValue Zero = DAG.getConstant(0, dl, MVT::i64);
11715 SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Reduce, Zero);
11716
11717 return Result;
11718 }
11719
11720 return SDValue();
11721}
11722
11723static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
11724 SDLoc DL(N);
11725 SDValue Op1 = N->getOperand(1);
11726 SDValue Op2 = N->getOperand(2);
11727 EVT ScalarTy = Op1.getValueType();
11728
11729 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) {
11730 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
11731 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
11732 }
11733
11734 return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0),
11735 Op1, Op2);
11736}
11737
11738static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
11739 SDLoc dl(N);
11740 SDValue Scalar = N->getOperand(3);
11741 EVT ScalarTy = Scalar.getValueType();
11742
11743 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
11744 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
11745
11746 SDValue Passthru = N->getOperand(1);
11747 SDValue Pred = N->getOperand(2);
11748 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
11749 Pred, Scalar, Passthru);
11750}
11751
11752static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
11753 SDLoc dl(N);
11754 LLVMContext &Ctx = *DAG.getContext();
11755 EVT VT = N->getValueType(0);
11756
11757 assert(VT.isScalableVector() && "Expected a scalable vector.");
11758
11759 // Current lowering only supports the SVE-ACLE types.
11760 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
11761 return SDValue();
11762
11763 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
11764 unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
11765 EVT ByteVT = EVT::getVectorVT(Ctx, MVT::i8, { ByteSize, true });
11766
11767 // Convert everything to the domain of EXT (i.e. bytes).
11768 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
11769 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
11770 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
11771 DAG.getConstant(ElemSize, dl, MVT::i32));
11772
11773 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
11774 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
11775}
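// Illustrative example (editorial): for nxv4i32 operands each element is 4
// bytes, so an element index of 3 becomes the byte index 3 * 4 = 12 and the
// node is lowered roughly as
//   (bitcast (EXT (bitcast Op0 to nxv16i8), (bitcast Op1 to nxv16i8), 12))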
11776
11777static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
11778 TargetLowering::DAGCombinerInfo &DCI,
11779 SelectionDAG &DAG) {
11780 if (DCI.isBeforeLegalize())
11781 return SDValue();
11782
11783 SDValue Comparator = N->getOperand(3);
11784 if (Comparator.getOpcode() == AArch64ISD::DUP ||
11785 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
11786 unsigned IID = getIntrinsicID(N);
11787 EVT VT = N->getValueType(0);
11788 EVT CmpVT = N->getOperand(2).getValueType();
11789 SDValue Pred = N->getOperand(1);
11790 SDValue Imm;
11791 SDLoc DL(N);
11792
11793 switch (IID) {
11794 default:
11795 llvm_unreachable("Called with wrong intrinsic!");
11796 break;
11797
11798 // Signed comparisons
11799 case Intrinsic::aarch64_sve_cmpeq_wide:
11800 case Intrinsic::aarch64_sve_cmpne_wide:
11801 case Intrinsic::aarch64_sve_cmpge_wide:
11802 case Intrinsic::aarch64_sve_cmpgt_wide:
11803 case Intrinsic::aarch64_sve_cmplt_wide:
11804 case Intrinsic::aarch64_sve_cmple_wide: {
11805 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
11806 int64_t ImmVal = CN->getSExtValue();
11807 if (ImmVal >= -16 && ImmVal <= 15)
11808 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
11809 else
11810 return SDValue();
11811 }
11812 break;
11813 }
11814 // Unsigned comparisons
11815 case Intrinsic::aarch64_sve_cmphs_wide:
11816 case Intrinsic::aarch64_sve_cmphi_wide:
11817 case Intrinsic::aarch64_sve_cmplo_wide:
11818 case Intrinsic::aarch64_sve_cmpls_wide: {
11819 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
11820 uint64_t ImmVal = CN->getZExtValue();
11821 if (ImmVal <= 127)
11822 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
11823 else
11824 return SDValue();
11825 }
11826 break;
11827 }
11828 }
11829
11830 if (!Imm)
11831 return SDValue();
11832
11833 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
11834 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
11835 N->getOperand(2), Splat, DAG.getCondCode(CC));
11836 }
11837
11838 return SDValue();
11839}
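// Illustrative example (editorial): for a wide compare such as
// svcmplt_wide_s8(pg, v, svdup_n_s64(4)), the comparator is a 64-bit splat
// of 4; since 4 lies in the signed range [-16, 15] accepted above, the node
// is rebuilt as a SETCC_MERGE_ZERO against an i32 splat, which selection can
// then match to the immediate form of the compare.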
11840
11841static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
11842 AArch64CC::CondCode Cond) {
11843 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11844
11845 SDLoc DL(Op);
11846 assert(Op.getValueType().isScalableVector() &&
11847 TLI.isTypeLegal(Op.getValueType()) &&
11848 "Expected legal scalable vector type!");
11849
11850 // Ensure target specific opcodes are using legal type.
11851 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
11852 SDValue TVal = DAG.getConstant(1, DL, OutVT);
11853 SDValue FVal = DAG.getConstant(0, DL, OutVT);
11854
11855 // Set condition code (CC) flags.
11856 SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);
11857
11858 // Convert CC to integer based on requested condition.
11859 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
11860 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
11861 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
11862 return DAG.getZExtOrTrunc(Res, DL, VT);
11863}
11864
11865static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
11866 SelectionDAG &DAG) {
11867 SDLoc DL(N);
11868
11869 SDValue Pred = N->getOperand(1);
11870 SDValue VecToReduce = N->getOperand(2);
11871
11872 EVT ReduceVT = VecToReduce.getValueType();
11873 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
11874
11875 // SVE reductions set the whole vector register with the first element
11876 // containing the reduction result, which we'll now extract.
11877 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11878 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
11879 Zero);
11880}
11881
11882static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
11883 SelectionDAG &DAG) {
11884 SDLoc DL(N);
11885
11886 SDValue Pred = N->getOperand(1);
11887 SDValue InitVal = N->getOperand(2);
11888 SDValue VecToReduce = N->getOperand(3);
11889 EVT ReduceVT = VecToReduce.getValueType();
11890
11891 // Ordered reductions use the first lane of the result vector as the
11892 // reduction's initial value.
11893 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11894 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
11895 DAG.getUNDEF(ReduceVT), InitVal, Zero);
11896
11897 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
11898
11899 // SVE reductions set the whole vector register with the first element
11900 // containing the reduction result, which we'll now extract.
11901 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
11902 Zero);
11903}
11904
11905static SDValue performIntrinsicCombine(SDNode *N,
11906 TargetLowering::DAGCombinerInfo &DCI,
11907 const AArch64Subtarget *Subtarget) {
11908 SelectionDAG &DAG = DCI.DAG;
11909 unsigned IID = getIntrinsicID(N);
11910 switch (IID) {
11911 default:
11912 break;
11913 case Intrinsic::aarch64_neon_vcvtfxs2fp:
11914 case Intrinsic::aarch64_neon_vcvtfxu2fp:
11915 return tryCombineFixedPointConvert(N, DCI, DAG);
11916 case Intrinsic::aarch64_neon_saddv:
11917 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
11918 case Intrinsic::aarch64_neon_uaddv:
11919 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
11920 case Intrinsic::aarch64_neon_sminv:
11921 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
11922 case Intrinsic::aarch64_neon_uminv:
11923 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
11924 case Intrinsic::aarch64_neon_smaxv:
11925 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
11926 case Intrinsic::aarch64_neon_umaxv:
11927 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
11928 case Intrinsic::aarch64_neon_fmax:
11929 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
11930 N->getOperand(1), N->getOperand(2));
11931 case Intrinsic::aarch64_neon_fmin:
11932 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
11933 N->getOperand(1), N->getOperand(2));
11934 case Intrinsic::aarch64_neon_fmaxnm:
11935 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
11936 N->getOperand(1), N->getOperand(2));
11937 case Intrinsic::aarch64_neon_fminnm:
11938 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
11939 N->getOperand(1), N->getOperand(2));
11940 case Intrinsic::aarch64_neon_smull:
11941 case Intrinsic::aarch64_neon_umull:
11942 case Intrinsic::aarch64_neon_pmull:
11943 case Intrinsic::aarch64_neon_sqdmull:
11944 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
11945 case Intrinsic::aarch64_neon_sqshl:
11946 case Intrinsic::aarch64_neon_uqshl:
11947 case Intrinsic::aarch64_neon_sqshlu:
11948 case Intrinsic::aarch64_neon_srshl:
11949 case Intrinsic::aarch64_neon_urshl:
11950 case Intrinsic::aarch64_neon_sshl:
11951 case Intrinsic::aarch64_neon_ushl:
11952 return tryCombineShiftImm(IID, N, DAG);
11953 case Intrinsic::aarch64_crc32b:
11954 case Intrinsic::aarch64_crc32cb:
11955 return tryCombineCRC32(0xff, N, DAG);
11956 case Intrinsic::aarch64_crc32h:
11957 case Intrinsic::aarch64_crc32ch:
11958 return tryCombineCRC32(0xffff, N, DAG);
11959 case Intrinsic::aarch64_sve_smaxv:
11960 return LowerSVEIntReduction(N, AArch64ISD::SMAXV_PRED, DAG);
11961 case Intrinsic::aarch64_sve_umaxv:
11962 return LowerSVEIntReduction(N, AArch64ISD::UMAXV_PRED, DAG);
11963 case Intrinsic::aarch64_sve_sminv:
11964 return LowerSVEIntReduction(N, AArch64ISD::SMINV_PRED, DAG);
11965 case Intrinsic::aarch64_sve_uminv:
11966 return LowerSVEIntReduction(N, AArch64ISD::UMINV_PRED, DAG);
11967 case Intrinsic::aarch64_sve_orv:
11968 return LowerSVEIntReduction(N, AArch64ISD::ORV_PRED, DAG);
11969 case Intrinsic::aarch64_sve_eorv:
11970 return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG);
11971 case Intrinsic::aarch64_sve_andv:
11972 return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG);
11973 case Intrinsic::aarch64_sve_index:
11974 return LowerSVEIntrinsicIndex(N, DAG);
11975 case Intrinsic::aarch64_sve_dup:
11976 return LowerSVEIntrinsicDUP(N, DAG);
11977 case Intrinsic::aarch64_sve_dup_x:
11978 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
11979 N->getOperand(1));
11980 case Intrinsic::aarch64_sve_ext:
11981 return LowerSVEIntrinsicEXT(N, DAG);
11982 case Intrinsic::aarch64_sve_smin:
11983 return DAG.getNode(AArch64ISD::SMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
11984 N->getOperand(1), N->getOperand(2), N->getOperand(3));
11985 case Intrinsic::aarch64_sve_umin:
11986 return DAG.getNode(AArch64ISD::UMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
11987 N->getOperand(1), N->getOperand(2), N->getOperand(3));
11988 case Intrinsic::aarch64_sve_smax:
11989 return DAG.getNode(AArch64ISD::SMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
11990 N->getOperand(1), N->getOperand(2), N->getOperand(3));
11991 case Intrinsic::aarch64_sve_umax:
11992 return DAG.getNode(AArch64ISD::UMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
11993 N->getOperand(1), N->getOperand(2), N->getOperand(3));
11994 case Intrinsic::aarch64_sve_lsl:
11995 return DAG.getNode(AArch64ISD::SHL_MERGE_OP1, SDLoc(N), N->getValueType(0),
11996 N->getOperand(1), N->getOperand(2), N->getOperand(3));
11997 case Intrinsic::aarch64_sve_lsr:
11998 return DAG.getNode(AArch64ISD::SRL_MERGE_OP1, SDLoc(N), N->getValueType(0),
11999 N->getOperand(1), N->getOperand(2), N->getOperand(3));
12000 case Intrinsic::aarch64_sve_asr:
12001 return DAG.getNode(AArch64ISD::SRA_MERGE_OP1, SDLoc(N), N->getValueType(0),
12002 N->getOperand(1), N->getOperand(2), N->getOperand(3));
12003 case Intrinsic::aarch64_sve_cmphs:
12004 if (!N->getOperand(2).getValueType().isFloatingPoint())
12005 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
12006 N->getValueType(0), N->getOperand(1), N->getOperand(2),
12007 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
12008 break;
12009 case Intrinsic::aarch64_sve_cmphi:
12010 if (!N->getOperand(2).getValueType().isFloatingPoint())
12011 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
12012 N->getValueType(0), N->getOperand(1), N->getOperand(2),
12013 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
12014 break;
12015 case Intrinsic::aarch64_sve_cmpge:
12016 if (!N->getOperand(2).getValueType().isFloatingPoint())
12017 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
12018 N->getValueType(0), N->getOperand(1), N->getOperand(2),
12019 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
12020 break;
12021 case Intrinsic::aarch64_sve_cmpgt:
12022 if (!N->getOperand(2).getValueType().isFloatingPoint())
12023 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
12024 N->getValueType(0), N->getOperand(1), N->getOperand(2),
12025 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
12026 break;
12027 case Intrinsic::aarch64_sve_cmpeq:
12028 if (!N->getOperand(2).getValueType().isFloatingPoint())
12029 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
12030 N->getValueType(0), N->getOperand(1), N->getOperand(2),
12031 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
12032 break;
12033 case Intrinsic::aarch64_sve_cmpne:
12034 if (!N->getOperand(2).getValueType().isFloatingPoint())
12035 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
12036 N->getValueType(0), N->getOperand(1), N->getOperand(2),
12037 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
12038 break;
12039 case Intrinsic::aarch64_sve_fadda:
12040 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
12041 case Intrinsic::aarch64_sve_faddv:
12042 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
12043 case Intrinsic::aarch64_sve_fmaxnmv:
12044 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
12045 case Intrinsic::aarch64_sve_fmaxv:
12046 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
12047 case Intrinsic::aarch64_sve_fminnmv:
12048 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
12049 case Intrinsic::aarch64_sve_fminv:
12050 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
12051 case Intrinsic::aarch64_sve_sel:
12052 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
12053 N->getOperand(1), N->getOperand(2), N->getOperand(3));
12054 case Intrinsic::aarch64_sve_cmpeq_wide:
12055 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
12056 case Intrinsic::aarch64_sve_cmpne_wide:
12057 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
12058 case Intrinsic::aarch64_sve_cmpge_wide:
12059 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
12060 case Intrinsic::aarch64_sve_cmpgt_wide:
12061 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
12062 case Intrinsic::aarch64_sve_cmplt_wide:
12063 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
12064 case Intrinsic::aarch64_sve_cmple_wide:
12065 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
12066 case Intrinsic::aarch64_sve_cmphs_wide:
12067 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
12068 case Intrinsic::aarch64_sve_cmphi_wide:
12069 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
12070 case Intrinsic::aarch64_sve_cmplo_wide:
12071 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
12072 case Intrinsic::aarch64_sve_cmpls_wide:
12073 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
12074 case Intrinsic::aarch64_sve_ptest_any:
12075 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
12076 AArch64CC::ANY_ACTIVE);
12077 case Intrinsic::aarch64_sve_ptest_first:
12078 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
12079 AArch64CC::FIRST_ACTIVE);
12080 case Intrinsic::aarch64_sve_ptest_last:
12081 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
12082 AArch64CC::LAST_ACTIVE);
12083 }
12084 return SDValue();
12085}
12086
12087static SDValue performExtendCombine(SDNode *N,
12088 TargetLowering::DAGCombinerInfo &DCI,
12089 SelectionDAG &DAG) {
12090 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
12091 // we can convert that DUP into another extract_high (of a bigger DUP), which
12092 // helps the backend to decide that an sabdl2 would be useful, saving a real
12093 // extract_high operation.
12094 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
12095 N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
12096 SDNode *ABDNode = N->getOperand(0).getNode();
12097 unsigned IID = getIntrinsicID(ABDNode);
12098 if (IID == Intrinsic::aarch64_neon_sabd ||
12099 IID == Intrinsic::aarch64_neon_uabd) {
12100 SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
12101 if (!NewABD.getNode())
12102 return SDValue();
12103
12104 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
12105 NewABD);
12106 }
12107 }
12108
12109 // This is effectively a custom type legalization for AArch64.
12110 //
12111 // Type legalization will split an extend of a small, legal, type to a larger
12112 // illegal type by first splitting the destination type, often creating
12113 // illegal source types, which then get legalized in isel-confusing ways,
12114 // leading to really terrible codegen. E.g.,
12115 // %result = v8i32 sext v8i8 %value
12116 // becomes
12117 // %losrc = extract_subreg %value, ...
12118 // %hisrc = extract_subreg %value, ...
12119 // %lo = v4i32 sext v4i8 %losrc
12120 // %hi = v4i32 sext v4i8 %hisrc
12121 // Things go rapidly downhill from there.
12122 //
12123 // For AArch64, the [sz]ext vector instructions can only go up one element
12124 // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
12125 // take two instructions.
12126 //
12127 // This implies that the most efficient way to do the extend from v8i8
12128 // to two v4i32 values is to first extend the v8i8 to v8i16, then let
12129 // the normal splitting happen for the v8i16->v8i32.
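// Illustrative example (editorial; schematic register assignment):
//   %r = sext <8 x i8> %v to <8 x i32>
// is best done with one 8-bit-to-16-bit step followed by the normal split:
//   sshll  v1.8h, v0.8b, #0   // v8i8  -> v8i16
//   sshll2 v2.4s, v1.8h, #0   // high half -> v4i32
//   sshll  v1.4s, v1.4h, #0   // low half  -> v4i32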
12130
12131 // This is pre-legalization to catch some cases where the default
12132 // type legalization will create ill-tempered code.
12133 if (!DCI.isBeforeLegalizeOps())
12134 return SDValue();
12135
12136 // We're only interested in cleaning things up for non-legal vector types
12137 // here. If both the source and destination are legal, things will just
12138 // work naturally without any fiddling.
12139 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12140 EVT ResVT = N->getValueType(0);
12141 if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
12142 return SDValue();
12143 // If the vector type isn't a simple VT, it's beyond the scope of what
12144 // we're worried about here. Let legalization do its thing and hope for
12145 // the best.
12146 SDValue Src = N->getOperand(0);
12147 EVT SrcVT = Src->getValueType(0);
12148 if (!ResVT.isSimple() || !SrcVT.isSimple())
12149 return SDValue();
12150
12151 // If the source VT is a 64-bit fixed or scalable vector, we can play games
12152 // and get the better results we want.
12153 if (SrcVT.getSizeInBits().getKnownMinSize() != 64)
12154 return SDValue();
12155
12156 unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
12157 ElementCount SrcEC = SrcVT.getVectorElementCount();
12159 SDLoc DL(N);
12160 Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
12161
12162 // Now split the rest of the operation into two halves, each with a 64
12163 // bit source.
12164 EVT LoVT, HiVT;
12165 SDValue Lo, Hi;
12166 LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext());
12167
12168 EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
12169 LoVT.getVectorElementCount());
12170 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
12171 DAG.getConstant(0, DL, MVT::i64));
12172 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
12173 DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64));
12174 Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
12175 Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
12176
12177 // Now combine the parts back together so we still have a single result
12178 // like the combiner expects.
12179 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
12180}
12181
12182static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
12183 SDValue SplatVal, unsigned NumVecElts) {
12184 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
12185 unsigned OrigAlignment = St.getAlignment();
12186 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
12187
12188 // Create scalar stores. This is at least as good as the code sequence for a
12189 // split unaligned store which is a dup.s, ext.b, and two stores.
12190 // Most of the time the three stores should be replaced by store pair
12191 // instructions (stp).
12192 SDLoc DL(&St);
12193 SDValue BasePtr = St.getBasePtr();
12194 uint64_t BaseOffset = 0;
12195
12196 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
12197 SDValue NewST1 =
12198 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
12199 OrigAlignment, St.getMemOperand()->getFlags());
12200
12201 // As this is in ISel, we will not merge this add which may degrade results.
12202 if (BasePtr->getOpcode() == ISD::ADD &&
12203 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
12204 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
12205 BasePtr = BasePtr->getOperand(0);
12206 }
12207
12208 unsigned Offset = EltOffset;
12209 while (--NumVecElts) {
12210 unsigned Alignment = MinAlign(OrigAlignment, Offset);
12211 SDValue OffsetPtr =
12212 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
12213 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
12214 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
12215 PtrInfo.getWithOffset(Offset), Alignment,
12216 St.getMemOperand()->getFlags());
12217 Offset += EltOffset;
12218 }
12219 return NewST1;
12220}
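// Illustrative example (editorial): a v2i64 splat store of X becomes
//   str X, [base]
//   str X, [base, #8]
// which the load/store optimizer can usually merge into a single stp.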
12221
12222// Returns an SVE type that ContentTy can be trivially sign or zero extended
12223// into.
12224static MVT getSVEContainerType(EVT ContentTy) {
12225 assert(ContentTy.isSimple() && "No SVE containers for extended types");
12226
12227 switch (ContentTy.getSimpleVT().SimpleTy) {
12228 default:
12229 llvm_unreachable("No known SVE container for this MVT type");
12230 case MVT::nxv2i8:
12231 case MVT::nxv2i16:
12232 case MVT::nxv2i32:
12233 case MVT::nxv2i64:
12234 case MVT::nxv2f32:
12235 case MVT::nxv2f64:
12236 return MVT::nxv2i64;
12237 case MVT::nxv4i8:
12238 case MVT::nxv4i16:
12239 case MVT::nxv4i32:
12240 case MVT::nxv4f32:
12241 return MVT::nxv4i32;
12242 case MVT::nxv8i8:
12243 case MVT::nxv8i16:
12244 case MVT::nxv8f16:
12245 case MVT::nxv8bf16:
12246 return MVT::nxv8i16;
12247 case MVT::nxv16i8:
12248 return MVT::nxv16i8;
12249 }
12250}
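// Illustrative example (editorial): nxv2i16 lives in the nxv2i64 container,
// so a predicated load of nxv2i16 is performed as an extending load into
// nxv2i64 and truncated back, as performLD1Combine below does.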
12251
12252static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
12253 SDLoc DL(N);
12254 EVT VT = N->getValueType(0);
12255
12256 if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
12257 return SDValue();
12258
12259 EVT ContainerVT = VT;
12260 if (ContainerVT.isInteger())
12261 ContainerVT = getSVEContainerType(ContainerVT);
12262
12263 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
12264 SDValue Ops[] = { N->getOperand(0), // Chain
12265 N->getOperand(2), // Pg
12266 N->getOperand(3), // Base
12267 DAG.getValueType(VT) };
12268
12269 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
12270 SDValue LoadChain = SDValue(Load.getNode(), 1);
12271
12272 if (ContainerVT.isInteger() && (VT != ContainerVT))
12273 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
12274
12275 return DAG.getMergeValues({ Load, LoadChain }, DL);
12276}
12277
12278static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
12279 SDLoc DL(N);
12280 EVT VT = N->getValueType(0);
12281 EVT PtrTy = N->getOperand(3).getValueType();
12282
12283 if (VT == MVT::nxv8bf16 &&
12284 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
12285 return SDValue();
12286
12287 EVT LoadVT = VT;
12288 if (VT.isFloatingPoint())
12289 LoadVT = VT.changeTypeToInteger();
12290
12291 auto *MINode = cast<MemIntrinsicSDNode>(N);
12292 SDValue PassThru = DAG.getUNDEF(LoadVT);
12293 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
12294 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
12295 MINode->getOperand(2), PassThru,
12296 MINode->getMemoryVT(), MINode->getMemOperand(),
12297 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
12298
12299 if (VT.isFloatingPoint()) {
12300 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
12301 return DAG.getMergeValues(Ops, DL);
12302 }
12303
12304 return L;
12305}
12306
12307template <unsigned Opcode>
12308static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
12309 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
12310 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
12311 "Unsupported opcode.");
12312 SDLoc DL(N);
12313 EVT VT = N->getValueType(0);
12314 if (VT == MVT::nxv8bf16 &&
12315 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
12316 return SDValue();
12317
12318 EVT LoadVT = VT;
12319 if (VT.isFloatingPoint())
12320 LoadVT = VT.changeTypeToInteger();
12321
12322 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
12323 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
12324 SDValue LoadChain = SDValue(Load.getNode(), 1);
12325
12326 if (VT.isFloatingPoint())
12327 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
12328
12329 return DAG.getMergeValues({Load, LoadChain}, DL);
12330}
12331
12332static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
12333 SDLoc DL(N);
12334 SDValue Data = N->getOperand(2);
12335 EVT DataVT = Data.getValueType();
12336 EVT HwSrcVt = getSVEContainerType(DataVT);
12337 SDValue InputVT = DAG.getValueType(DataVT);
12338
12339 if (DataVT == MVT::nxv8bf16 &&
12340 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
12341 return SDValue();
12342
12343 if (DataVT.isFloatingPoint())
12344 InputVT = DAG.getValueType(HwSrcVt);
12345
12346 SDValue SrcNew;
12347 if (Data.getValueType().isFloatingPoint())
12348 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
12349 else
12350 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
12351
12352 SDValue Ops[] = { N->getOperand(0), // Chain
12353 SrcNew,
12354 N->getOperand(4), // Base
12355 N->getOperand(3), // Pg
12356 InputVT
12357 };
12358
12359 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
12360}
12361
12362static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
12363 SDLoc DL(N);
12364
12365 SDValue Data = N->getOperand(2);
12366 EVT DataVT = Data.getValueType();
12367 EVT PtrTy = N->getOperand(4).getValueType();
12368
12369 if (DataVT == MVT::nxv8bf16 &&
12370 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
12371 return SDValue();
12372
12373 if (DataVT.isFloatingPoint())
12374 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
12375
12375
12376 auto *MINode = cast<MemIntrinsicSDNode>(N);
12377 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
12378 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
12379 MINode->getMemoryVT(), MINode->getMemOperand(),
12380 ISD::UNINDEXED, false, false);
12381}
12382
12383/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
12384/// load store optimizer pass will merge them to store pair stores. This should
12385/// be better than a movi to create the vector zero followed by a vector store
12386/// if the zero constant is not re-used, since one instruction and one register
12387/// live range will be removed.
12388///
12389/// For example, the final generated code should be:
12390///
12391/// stp xzr, xzr, [x0]
12392///
12393/// instead of:
12394///
12395/// movi v0.2d, #0
12396/// str q0, [x0]
12397///
12398static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
12399 SDValue StVal = St.getValue();
12400 EVT VT = StVal.getValueType();
12401
12402 // Avoid scalarizing zero splat stores for scalable vectors.
12403 if (VT.isScalableVector())
12404 return SDValue();
12405
12406 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
12407 // 2, 3 or 4 i32 elements.
12408 int NumVecElts = VT.getVectorNumElements();
12409 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
12410 VT.getVectorElementType().getSizeInBits() == 64) ||
12411 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
12412 VT.getVectorElementType().getSizeInBits() == 32)))
12413 return SDValue();
12414
12415 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
12416 return SDValue();
12417
12418 // If the zero constant has more than one use then the vector store could be
12419 // better since the constant mov will be amortized and stp q instructions
12420 // should be able to be formed.
12421 if (!StVal.hasOneUse())
12422 return SDValue();
12423
12424 // If the store is truncating then it's going down to i16 or smaller, which
12425 // means it can be implemented in a single store anyway.
12426 if (St.isTruncatingStore())
12427 return SDValue();
12428
12429 // If the immediate offset of the address operand is too large for the stp
12430 // instruction, then bail out.
12431 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
12432 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
12433 if (Offset < -512 || Offset > 504)
12434 return SDValue();
12435 }
12436
12437 for (int I = 0; I < NumVecElts; ++I) {
12438 SDValue EltVal = StVal.getOperand(I);
12439 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
12440 return SDValue();
12441 }
12442
12443 // Use a CopyFromReg WZR/XZR here to prevent
12444 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
12445 SDLoc DL(&St);
12446 unsigned ZeroReg;
12447 EVT ZeroVT;
12448 if (VT.getVectorElementType().getSizeInBits() == 32) {
12449 ZeroReg = AArch64::WZR;
12450 ZeroVT = MVT::i32;
12451 } else {
12452 ZeroReg = AArch64::XZR;
12453 ZeroVT = MVT::i64;
12454 }
12455 SDValue SplatVal =
12456 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
12457 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
12458}
12459
12460/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
12461/// value. The load store optimizer pass will merge them to store pair stores.
12462/// This has better performance than a splat of the scalar followed by a split
12463/// vector store. Even if the stores are not merged it is four stores vs a dup,
12464/// followed by an ext.b and two stores.
12465static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
12466 SDValue StVal = St.getValue();
12467 EVT VT = StVal.getValueType();
12468
12469 // Don't replace floating point stores, they possibly won't be transformed to
12470 // stp because of the store pair suppress pass.
12471 if (VT.isFloatingPoint())
12472 return SDValue();
12473
12474 // We can express a splat as store pair(s) for 2 or 4 elements.
12475 unsigned NumVecElts = VT.getVectorNumElements();
12476 if (NumVecElts != 4 && NumVecElts != 2)
12477 return SDValue();
12478
12479 // If the store is truncating then it's going down to i16 or smaller, which
12480 // means it can be implemented in a single store anyway.
12481 if (St.isTruncatingStore())
12482 return SDValue();
12483
12484 // Check that this is a splat.
12485 // Make sure that each of the relevant vector element locations are inserted
12486 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
12487 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
12488 SDValue SplatVal;
12489 for (unsigned I = 0; I < NumVecElts; ++I) {
12490 // Check for insert vector elements.
12491 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
12492 return SDValue();
12493
12494 // Check that same value is inserted at each vector element.
12495 if (I == 0)
12496 SplatVal = StVal.getOperand(1);
12497 else if (StVal.getOperand(1) != SplatVal)
12498 return SDValue();
12499
12500 // Check insert element index.
12501 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
12502 if (!CIndex)
12503 return SDValue();
12504 uint64_t IndexVal = CIndex->getZExtValue();
12505 if (IndexVal >= NumVecElts)
12506 return SDValue();
12507 IndexNotInserted.reset(IndexVal);
12508
12509 StVal = StVal.getOperand(0);
12510 }
12511 // Check that all vector element locations were inserted to.
12512 if (IndexNotInserted.any())
12513 return SDValue();
12514
12515 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
12516}
12517
12518static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
12519 SelectionDAG &DAG,
12520 const AArch64Subtarget *Subtarget) {
12521
12522 StoreSDNode *S = cast<StoreSDNode>(N);
12523 if (S->isVolatile() || S->isIndexed())
12524 return SDValue();
12525
12526 SDValue StVal = S->getValue();
12527 EVT VT = StVal.getValueType();
12528
12529 if (!VT.isFixedLengthVector())
12530 return SDValue();
12531
12532 // If we get a splat of zeros, convert this vector store to a store of
12533 // scalars. They will be merged into store pairs of xzr thereby removing one
12534 // instruction and one register.
12535 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
12536 return ReplacedZeroSplat;
12537
12538 // FIXME: The logic for deciding if an unaligned store should be split should
12539 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
12540 // a call to that function here.
12541
12542 if (!Subtarget->isMisaligned128StoreSlow())
12543 return SDValue();
12544
12545 // Don't split at -Oz.
12546 if (DAG.getMachineFunction().getFunction().hasMinSize())
12547 return SDValue();
12548
12549 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
12550 // those up regresses performance on micro-benchmarks and olden/bh.
12551 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
12552 return SDValue();
12553
12554 // Split unaligned 16B stores. They are terrible for performance.
12555 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
12556 // extensions can use this to mark that it does not want splitting to happen
12557 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
12558 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
12559 if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
12560 S->getAlignment() <= 2)
12561 return SDValue();
12562
12563 // If we get a splat of a scalar convert this vector store to a store of
12564 // scalars. They will be merged into store pairs thereby removing two
12565 // instructions.
12566 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
12567 return ReplacedSplat;
12568
12569 SDLoc DL(S);
12570
12571 // Split VT into two.
12572 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12573 unsigned NumElts = HalfVT.getVectorNumElements();
12574 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
12575 DAG.getConstant(0, DL, MVT::i64));
12576 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
12577 DAG.getConstant(NumElts, DL, MVT::i64));
12578 SDValue BasePtr = S->getBasePtr();
12579 SDValue NewST1 =
12580 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
12581 S->getAlignment(), S->getMemOperand()->getFlags());
12582 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
12583 DAG.getConstant(8, DL, MVT::i64));
12584 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
12585 S->getPointerInfo(), S->getAlignment(),
12586 S->getMemOperand()->getFlags());
12587}
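// Illustrative example (editorial; schematic registers): on subtargets where
// misaligned 128-bit stores are slow, an unaligned
//   str q0, [x0]
// is rewritten as two 64-bit stores of the vector's halves, roughly
//   ext v1.16b, v0.16b, v0.16b, #8
//   str d0, [x0]
//   str d1, [x0, #8]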
12588
12589/// Target-specific DAG combine function for post-increment LD1 (lane) and
12590/// post-increment LD1R.
12591static SDValue performPostLD1Combine(SDNode *N,
12592 TargetLowering::DAGCombinerInfo &DCI,
12593 bool IsLaneOp) {
12594 if (DCI.isBeforeLegalizeOps())
12595 return SDValue();
12596
12597 SelectionDAG &DAG = DCI.DAG;
12598 EVT VT = N->getValueType(0);
12599
12600 if (VT.isScalableVector())
12601 return SDValue();
12602
12603 unsigned LoadIdx = IsLaneOp ? 1 : 0;
12604 SDNode *LD = N->getOperand(LoadIdx).getNode();
12606 // If it is not a LOAD, we cannot do such a combine.
12606 if (LD->getOpcode() != ISD::LOAD)
12607 return SDValue();
12608
12609 // The vector lane must be a constant in the LD1LANE opcode.
12610 SDValue Lane;
12611 if (IsLaneOp) {
12612 Lane = N->getOperand(2);
12613 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
12614 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
12615 return SDValue();
12616 }
12617
12618 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
12619 EVT MemVT = LoadSDN->getMemoryVT();
12620 // Check if memory operand is the same type as the vector element.
12621 if (MemVT != VT.getVectorElementType())
12622 return SDValue();
12623
12624 // Check if there are other uses. If so, do not combine as it will introduce
12625 // an extra load.
12626 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
12627 ++UI) {
12628 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
12629 continue;
12630 if (*UI != N)
12631 return SDValue();
12632 }
12633
12634 SDValue Addr = LD->getOperand(1);
12635 SDValue Vector = N->getOperand(0);
12636 // Search for a use of the address operand that is an increment.
12637 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
12638 Addr.getNode()->use_end(); UI != UE; ++UI) {
12639 SDNode *User = *UI;
12640 if (User->getOpcode() != ISD::ADD
12641 || UI.getUse().getResNo() != Addr.getResNo())
12642 continue;
12643
12644 // If the increment is a constant, it must match the memory ref size.
12645 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
12646 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
12647 uint32_t IncVal = CInc->getZExtValue();
12648 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
12649 if (IncVal != NumBytes)
12650 continue;
12651 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
12652 }
12653
12654 // To avoid cycle construction make sure that neither the load nor the add
12655 // are predecessors to each other or the Vector.
12656 SmallPtrSet<const SDNode *, 32> Visited;
12657 SmallVector<const SDNode *, 16> Worklist;
12658 Visited.insert(Addr.getNode());
12659 Worklist.push_back(User);
12660 Worklist.push_back(LD);
12661 Worklist.push_back(Vector.getNode());
12662 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
12663 SDNode::hasPredecessorHelper(User, Visited, Worklist))
12664 continue;
12665
12667 Ops.push_back(LD->getOperand(0)); // Chain
12668 if (IsLaneOp) {
12669 Ops.push_back(Vector); // The vector to be inserted
12670 Ops.push_back(Lane); // The lane to be inserted in the vector
12671 }
12672 Ops.push_back(Addr);
12673 Ops.push_back(Inc);
12674
12675 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
12676 SDVTList SDTys = DAG.getVTList(Tys);
12677 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
12678 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
12679 MemVT,
12680 LoadSDN->getMemOperand());
12681
12682 // Update the uses.
12683 SDValue NewResults[] = {
12684 SDValue(LD, 0), // The result of load
12685 SDValue(UpdN.getNode(), 2) // Chain
12686 };
12687 DCI.CombineTo(LD, NewResults);
12688 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
12689 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
12690
12691 break;
12692 }
12693 return SDValue();
12694}
12695
12696/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
12697/// address translation.
12698static bool performTBISimplification(SDValue Addr,
12699 TargetLowering::DAGCombinerInfo &DCI,
12700 SelectionDAG &DAG) {
12701 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
12702 KnownBits Known;
12703 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
12704 !DCI.isBeforeLegalizeOps());
12705 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12706 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
12707 DCI.CommitTargetLoweringOpt(TLO);
12708 return true;
12709 }
12710 return false;
12711}
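// Illustrative example (editorial): with TBI, only bits [55:0] of a
// load/store address take part in translation, so for a tagged pointer like
//   %p = or i64 %base, 0xff00000000000000
// used only as a memory address, SimplifyDemandedBits can remove the
// tag-setting operation, because the top byte is never demanded.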
12712
12713static SDValue performSTORECombine(SDNode *N,
12714 TargetLowering::DAGCombinerInfo &DCI,
12715 SelectionDAG &DAG,
12716 const AArch64Subtarget *Subtarget) {
12717 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
12718 return Split;
12719
12720 if (Subtarget->supportsAddressTopByteIgnored() &&
12721 performTBISimplification(N->getOperand(2), DCI, DAG))
12722 return SDValue(N, 0);
12723
12724 return SDValue();
12725}
12726
12727
12728/// Target-specific DAG combine function for NEON load/store intrinsics
12729/// to merge base address updates.
12730static SDValue performNEONPostLDSTCombine(SDNode *N,
12731 TargetLowering::DAGCombinerInfo &DCI,
12732 SelectionDAG &DAG) {
12733 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
12734 return SDValue();
12735
12736 unsigned AddrOpIdx = N->getNumOperands() - 1;
12737 SDValue Addr = N->getOperand(AddrOpIdx);
12738
12739 // Search for a use of the address operand that is an increment.
12740 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
12741 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
12742 SDNode *User = *UI;
12743 if (User->getOpcode() != ISD::ADD ||
12744 UI.getUse().getResNo() != Addr.getResNo())
12745 continue;
12746
12747 // Check that the add is independent of the load/store. Otherwise, folding
12748 // it would create a cycle.
12749 SmallPtrSet<const SDNode *, 32> Visited;
12750 SmallVector<const SDNode *, 16> Worklist;
12751 Visited.insert(Addr.getNode());
12752 Worklist.push_back(N);
12753 Worklist.push_back(User);
12754 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
12755 SDNode::hasPredecessorHelper(User, Visited, Worklist))
12756 continue;
12757
12758 // Find the new opcode for the updating load/store.
12759 bool IsStore = false;
12760 bool IsLaneOp = false;
12761 bool IsDupOp = false;
12762 unsigned NewOpc = 0;
12763 unsigned NumVecs = 0;
12764 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
12765 switch (IntNo) {
12766 default: llvm_unreachable("unexpected intrinsic for Neon base update");
12767 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
12768 NumVecs = 2; break;
12769 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
12770 NumVecs = 3; break;
12771 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
12772 NumVecs = 4; break;
12773 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
12774 NumVecs = 2; IsStore = true; break;
12775 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
12776 NumVecs = 3; IsStore = true; break;
12777 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
12778 NumVecs = 4; IsStore = true; break;
12779 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
12780 NumVecs = 2; break;
12781 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
12782 NumVecs = 3; break;
12783 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
12784 NumVecs = 4; break;
12785 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
12786 NumVecs = 2; IsStore = true; break;
12787 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
12788 NumVecs = 3; IsStore = true; break;
12789 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
12790 NumVecs = 4; IsStore = true; break;
12791 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
12792 NumVecs = 2; IsDupOp = true; break;
12793 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
12794 NumVecs = 3; IsDupOp = true; break;
12795 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
12796 NumVecs = 4; IsDupOp = true; break;
12797 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
12798 NumVecs = 2; IsLaneOp = true; break;
12799 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
12800 NumVecs = 3; IsLaneOp = true; break;
12801 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
12802 NumVecs = 4; IsLaneOp = true; break;
12803 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
12804 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
12805 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
12806 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
12807 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
12808 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
12809 }
12810
12811 EVT VecTy;
12812 if (IsStore)
12813 VecTy = N->getOperand(2).getValueType();
12814 else
12815 VecTy = N->getValueType(0);
12816
12817 // If the increment is a constant, it must match the memory ref size.
12818 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
12819 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
12820 uint32_t IncVal = CInc->getZExtValue();
12821 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
12822 if (IsLaneOp || IsDupOp)
12823 NumBytes /= VecTy.getVectorNumElements();
12824 if (IncVal != NumBytes)
12825 continue;
12826 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
12827 }
12828 SmallVector<SDValue, 8> Ops;
12829 Ops.push_back(N->getOperand(0)); // Incoming chain
12830 // Load lane and store have vector list as input.
12831 if (IsLaneOp || IsStore)
12832 for (unsigned i = 2; i < AddrOpIdx; ++i)
12833 Ops.push_back(N->getOperand(i));
12834 Ops.push_back(Addr); // Base register
12835 Ops.push_back(Inc);
12836
12837 // Return Types.
12838 EVT Tys[6];
12839 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
12840 unsigned n;
12841 for (n = 0; n < NumResultVecs; ++n)
12842 Tys[n] = VecTy;
12843 Tys[n++] = MVT::i64; // Type of write back register
12844 Tys[n] = MVT::Other; // Type of the chain
12845 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
12846
12847 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
12848 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
12849 MemInt->getMemoryVT(),
12850 MemInt->getMemOperand());
12851
12852 // Update the uses.
12853 std::vector<SDValue> NewResults;
12854 for (unsigned i = 0; i < NumResultVecs; ++i) {
12855 NewResults.push_back(SDValue(UpdN.getNode(), i));
12856 }
12857 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
12858 DCI.CombineTo(N, NewResults);
12859 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
12860
12861 break;
12862 }
12863 return SDValue();
12864}
12865
12866// Checks to see if the value is the prescribed width and returns information
12867// about its extension mode.
12868static
12869bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
12870 ExtType = ISD::NON_EXTLOAD;
12871 switch(V.getNode()->getOpcode()) {
12872 default:
12873 return false;
12874 case ISD::LOAD: {
12875 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
12876 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
12877 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
12878 ExtType = LoadNode->getExtensionType();
12879 return true;
12880 }
12881 return false;
12882 }
12883 case ISD::AssertSext: {
12884 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
12885 if ((TypeNode->getVT() == MVT::i8 && width == 8)
12886 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
12887 ExtType = ISD::SEXTLOAD;
12888 return true;
12889 }
12890 return false;
12891 }
12892 case ISD::AssertZext: {
12893 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
12894 if ((TypeNode->getVT() == MVT::i8 && width == 8)
12895 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
12896 ExtType = ISD::ZEXTLOAD;
12897 return true;
12898 }
12899 return false;
12900 }
12901 case ISD::Constant:
12902 case ISD::TargetConstant: {
12903 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
12904 1LL << (width - 1);
12905 }
12906 }
12907
12908 return true;
12909}
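// Editorial illustration (not part of the upstream source): checkValueWidth
// accepts, for width == 8, an i8 extending load (ExtType takes the load's
// extension type), an AssertSext/AssertZext to i8 (SEXTLOAD/ZEXTLOAD
// respectively), or a constant whose magnitude fits in 7 bits; anything
// else returns false and the caller must keep the mask.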
12910
12911// This function does a whole lot of voodoo to determine if the tests are
12912// equivalent without and with a mask. Essentially what happens is that given a
12913// DAG resembling:
12914//
12915// +-------------+ +-------------+ +-------------+ +-------------+
12916// | Input | | AddConstant | | CompConstant| | CC |
12917// +-------------+ +-------------+ +-------------+ +-------------+
12918// | | | |
12919// V V | +----------+
12920// +-------------+ +----+ | |
12921// | ADD | |0xff| | |
12922// +-------------+ +----+ | |
12923// | | | |
12924// V V | |
12925// +-------------+ | |
12926// | AND | | |
12927// +-------------+ | |
12928// | | |
12929// +-----+ | |
12930// | | |
12931// V V V
12932// +-------------+
12933// | CMP |
12934// +-------------+
12935//
12936// The AND node may be safely removed for some combinations of inputs. In
12937// particular we need to take into account the extension type of the Input,
12938// the exact values of AddConstant, CompConstant, and CC, along with the nominal
12939// width of the input (this can work for any width inputs, the above graph is
12940// specific to 8 bits.
12941//
12942// The specific equations were worked out by generating output tables for each
12943// AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The
12944// problem was simplified by working with 4 bit inputs, which means we only
12945// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
12946// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
12947// patterns present in both extensions (0,7). For every distinct set of
12948// AddConstant and CompConstants bit patterns we can consider the masked and
12949// unmasked versions to be equivalent if the result of this function is true for
12950// all 16 distinct bit patterns of for the current extension type of Input (w0).
12951//
12952// sub w8, w0, w1
12953// and w10, w8, #0x0f
12954// cmp w8, w2
12955// cset w9, AArch64CC
12956// cmp w10, w2
12957// cset w11, AArch64CC
12958// cmp w9, w11
12959// cset w0, eq
12960// ret
12961//
12962// Since the above function shows when the outputs are equivalent it defines
12963// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
12964// would be expensive to run during compiles. The equations below were written
12965// in a test harness that confirmed they gave equivalent outputs to the above
12966// for all inputs function, so they can be used determine if the removal is
12967// legal instead.
12968//
12969// isEquivalentMaskless() is the code for testing if the AND can be removed
12970// factored out of the DAG recognition as the DAG can take several forms.
12971
12972static bool isEquivalentMaskless(unsigned CC, unsigned width,
12973 ISD::LoadExtType ExtType, int AddConstant,
12974 int CompConstant) {
12975 // By being careful about our equations and only writing them in terms of
12976 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
12977 // make them generally applicable to all bit widths.
12978 int MaxUInt = (1 << width);
12979
12980 // For the purposes of these comparisons sign extending the type is
12981 // equivalent to zero extending the add and displacing it by half the integer
12982 // width. Provided we are careful and make sure our equations are valid over
12983 // the whole range we can just adjust the input and avoid writing equations
12984 // for sign extended inputs.
12985 if (ExtType == ISD::SEXTLOAD)
12986 AddConstant -= (1 << (width-1));
12987
12988 switch(CC) {
12989 case AArch64CC::LE:
12990 case AArch64CC::GT:
12991 if ((AddConstant == 0) ||
12992 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
12993 (AddConstant >= 0 && CompConstant < 0) ||
12994 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
12995 return true;
12996 break;
12997 case AArch64CC::LT:
12998 case AArch64CC::GE:
12999 if ((AddConstant == 0) ||
13000 (AddConstant >= 0 && CompConstant <= 0) ||
13001 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
13002 return true;
13003 break;
13004 case AArch64CC::HI:
13005 case AArch64CC::LS:
13006 if ((AddConstant >= 0 && CompConstant < 0) ||
13007 (AddConstant <= 0 && CompConstant >= -1 &&
13008 CompConstant < AddConstant + MaxUInt && CompConstant >= AddConstant))
13009 return true;
13010 break;
13011 case AArch64CC::PL:
13012 case AArch64CC::MI:
13013 if ((AddConstant == 0) ||
13014 (AddConstant > 0 && CompConstant <= 0) ||
13015 (AddConstant < 0 && CompConstant <= AddConstant))
13016 return true;
13017 break;
13018 case AArch64CC::LO:
13019 case AArch64CC::HS:
13020 if ((AddConstant >= 0 && CompConstant <= 0) ||
13021 (AddConstant <= 0 && CompConstant >= 0 &&
13022 CompConstant <= AddConstant + MaxUInt))
13023 return true;
13024 break;
13025 case AArch64CC::EQ:
13026 case AArch64CC::NE:
13027 if ((AddConstant > 0 && CompConstant < 0) ||
13028 (AddConstant < 0 && CompConstant >= 0 &&
13029 CompConstant < AddConstant + MaxUInt) ||
13030 (AddConstant >= 0 && CompConstant >= 0 &&
13031 CompConstant >= AddConstant) ||
13032 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
13033 return true;
13034 break;
13035 case AArch64CC::VS:
13036 case AArch64CC::VC:
13037 case AArch64CC::AL:
13038 case AArch64CC::NV:
13039 return true;
13040 case AArch64CC::Invalid:
13041 break;
13042 }
13043
13044 return false;
13045}
13046
13047static
13048 SDValue performCONDCombine(SDNode *N,
13049                            TargetLowering::DAGCombinerInfo &DCI,
13050                            SelectionDAG &DAG, unsigned CCIndex,
13051 unsigned CmpIndex) {
13052 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
13053 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
13054 unsigned CondOpcode = SubsNode->getOpcode();
13055
13056 if (CondOpcode != AArch64ISD::SUBS)
13057 return SDValue();
13058
13059 // There is a SUBS feeding this condition. Is it fed by a mask we can
13060 // use?
13061
13062 SDNode *AndNode = SubsNode->getOperand(0).getNode();
13063 unsigned MaskBits = 0;
13064
13065 if (AndNode->getOpcode() != ISD::AND)
13066 return SDValue();
13067
13068 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
13069 uint32_t CNV = CN->getZExtValue();
13070 if (CNV == 255)
13071 MaskBits = 8;
13072 else if (CNV == 65535)
13073 MaskBits = 16;
13074 }
13075
13076 if (!MaskBits)
13077 return SDValue();
13078
13079 SDValue AddValue = AndNode->getOperand(0);
13080
13081 if (AddValue.getOpcode() != ISD::ADD)
13082 return SDValue();
13083
13084 // The basic dag structure is correct, grab the inputs and validate them.
13085
13086 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
13087 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
13088 SDValue SubsInputValue = SubsNode->getOperand(1);
13089
13090 // The mask is present and the provenance of all the values is a smaller type,
13091 // let's see if the mask is superfluous.
13092
13093 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
13094 !isa<ConstantSDNode>(SubsInputValue.getNode()))
13095 return SDValue();
13096
13097 ISD::LoadExtType ExtType;
13098
13099 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
13100 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
13101 !checkValueWidth(AddInputValue1, MaskBits, ExtType))
13102 return SDValue();
13103
13104 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
13105 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
13106 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
13107 return SDValue();
13108
13109 // The AND is not necessary, remove it.
13110
13111 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
13112 SubsNode->getValueType(1));
13113 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
13114
13115 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
13116 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
13117
13118 return SDValue(N, 0);
13119}
13120
13121// Optimize compare with zero and branch.
13122 static SDValue performBRCONDCombine(SDNode *N,
13123                                     TargetLowering::DAGCombinerInfo &DCI,
13124                                     SelectionDAG &DAG) {
13125 MachineFunction &MF = DAG.getMachineFunction();
13126 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
13127 // will not be produced, as they are conditional branch instructions that do
13128 // not set flags.
13129 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
13130 return SDValue();
13131
13132 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
13133 N = NV.getNode();
13134 SDValue Chain = N->getOperand(0);
13135 SDValue Dest = N->getOperand(1);
13136 SDValue CCVal = N->getOperand(2);
13137 SDValue Cmp = N->getOperand(3);
13138
13139 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
13140 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
13141 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
13142 return SDValue();
13143
13144 unsigned CmpOpc = Cmp.getOpcode();
13145 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
13146 return SDValue();
13147
13148 // Only attempt folding if there is only one use of the flag and no use of the
13149 // value.
13150 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
13151 return SDValue();
13152
13153 SDValue LHS = Cmp.getOperand(0);
13154 SDValue RHS = Cmp.getOperand(1);
13155
13156 assert(LHS.getValueType() == RHS.getValueType() &&
13157 "Expected the value type to be the same for both operands!");
13158 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
13159 return SDValue();
13160
13161 if (isNullConstant(LHS))
13162 std::swap(LHS, RHS);
13163
13164 if (!isNullConstant(RHS))
13165 return SDValue();
13166
13167 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
13168 LHS.getOpcode() == ISD::SRL)
13169 return SDValue();
13170
13171 // Fold the compare into the branch instruction.
13172 SDValue BR;
13173 if (CC == AArch64CC::EQ)
13174 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
13175 else
13176 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
13177
13178 // Do not add new nodes to DAG combiner worklist.
13179 DCI.CombineTo(N, BR, false);
13180
13181 return SDValue();
13182}
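// Editorial example (not part of the upstream source): given
//   (brcond NE, (subs x, 0), dest)
// with the SUBS value result unused, the combine above emits
// (cbnz x, dest); the EQ form becomes (cbz x, dest).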
13183
13184// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
13185// as well as whether the test should be inverted. This code is required to
13186// catch these cases (as opposed to standard dag combines) because
13187// AArch64ISD::TBZ is matched during legalization.
13188static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
13189 SelectionDAG &DAG) {
13190
13191 if (!Op->hasOneUse())
13192 return Op;
13193
13194 // We don't handle undef/constant-fold cases below, as they should have
13195 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
13196 // etc.)
13197
13198 // (tbz (trunc x), b) -> (tbz x, b)
13199 // This case is just here to enable more of the below cases to be caught.
13200 if (Op->getOpcode() == ISD::TRUNCATE &&
13201 Bit < Op->getValueType(0).getSizeInBits()) {
13202 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
13203 }
13204
13205 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
13206 if (Op->getOpcode() == ISD::ANY_EXTEND &&
13207 Bit < Op->getOperand(0).getValueSizeInBits()) {
13208 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
13209 }
13210
13211 if (Op->getNumOperands() != 2)
13212 return Op;
13213
13214 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13215 if (!C)
13216 return Op;
13217
13218 switch (Op->getOpcode()) {
13219 default:
13220 return Op;
13221
13222 // (tbz (and x, m), b) -> (tbz x, b)
13223 case ISD::AND:
13224 if ((C->getZExtValue() >> Bit) & 1)
13225 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
13226 return Op;
13227
13228 // (tbz (shl x, c), b) -> (tbz x, b-c)
13229 case ISD::SHL:
13230 if (C->getZExtValue() <= Bit &&
13231 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
13232 Bit = Bit - C->getZExtValue();
13233 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
13234 }
13235 return Op;
13236
13237 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
13238 case ISD::SRA:
13239 Bit = Bit + C->getZExtValue();
13240 if (Bit >= Op->getValueType(0).getSizeInBits())
13241 Bit = Op->getValueType(0).getSizeInBits() - 1;
13242 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
13243
13244 // (tbz (srl x, c), b) -> (tbz x, b+c)
13245 case ISD::SRL:
13246 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
13247 Bit = Bit + C->getZExtValue();
13248 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
13249 }
13250 return Op;
13251
13252 // (tbz (xor x, -1), b) -> (tbnz x, b)
13253 case ISD::XOR:
13254 if ((C->getZExtValue() >> Bit) & 1)
13255 Invert = !Invert;
13256 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
13257 }
13258}
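// Editorial example (not part of the upstream source): for
// (tbz (srl x, 3), 2) the SRL case above rewrites Bit to 2 + 3 = 5 and
// recurses, yielding (tbz x, 5); an (xor x, -1) on the path would instead
// flip Invert so the caller emits TBNZ rather than TBZ.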
13259
13260// Optimize test single bit zero/non-zero and branch.
13261 static SDValue performTBZCombine(SDNode *N,
13262                                  TargetLowering::DAGCombinerInfo &DCI,
13263                                  SelectionDAG &DAG) {
13264 unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
13265 bool Invert = false;
13266 SDValue TestSrc = N->getOperand(1);
13267 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
13268
13269 if (TestSrc == NewTestSrc)
13270 return SDValue();
13271
13272 unsigned NewOpc = N->getOpcode();
13273 if (Invert) {
13274 if (NewOpc == AArch64ISD::TBZ)
13275 NewOpc = AArch64ISD::TBNZ;
13276 else {
13277 assert(NewOpc == AArch64ISD::TBNZ);
13278 NewOpc = AArch64ISD::TBZ;
13279 }
13280 }
13281
13282 SDLoc DL(N);
13283 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
13284 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
13285}
13286
13287// vselect (v1i1 setcc) ->
13288// vselect (v1iXX setcc) (XX is the size of the compared operand type)
13289// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
13290// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
13291// such VSELECT.
13292 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
13293 SDValue N0 = N->getOperand(0);
13294 EVT CCVT = N0.getValueType();
13295
13296 if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
13297 CCVT.getVectorElementType() != MVT::i1)
13298 return SDValue();
13299
13300 EVT ResVT = N->getValueType(0);
13301 EVT CmpVT = N0.getOperand(0).getValueType();
13302 // Only combine when the result type is of the same size as the compared
13303 // operands.
13304 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
13305 return SDValue();
13306
13307 SDValue IfTrue = N->getOperand(1);
13308 SDValue IfFalse = N->getOperand(2);
13309 SDValue SetCC =
13310 DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
13311 N0.getOperand(0), N0.getOperand(1),
13312 cast<CondCodeSDNode>(N0.getOperand(2))->get());
13313 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
13314 IfTrue, IfFalse);
13315}
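// Editorial example (not part of the upstream source): a condition like
//   vselect (v1i1 (setcc v1i64 a, b, setlt)), x, y
// is rewritten so the setcc produces v1i64, the size of the compared
// operands, which the type legalizer can handle directly.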
13316
13317/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
13318/// the compare-mask instructions rather than going via NZCV, even if LHS and
13319/// RHS are really scalar. This replaces any scalar setcc in the above pattern
13320/// with a vector one followed by a DUP shuffle on the result.
13321 static SDValue performSelectCombine(SDNode *N,
13322                                     TargetLowering::DAGCombinerInfo &DCI) {
13323 SelectionDAG &DAG = DCI.DAG;
13324 SDValue N0 = N->getOperand(0);
13325 EVT ResVT = N->getValueType(0);
13326
13327 if (N0.getOpcode() != ISD::SETCC)
13328 return SDValue();
13329
13330 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
13331 // scalar SetCCResultType. We also don't expect vectors, because we assume
13332 // that selects fed by vector SETCCs are canonicalized to VSELECT.
13333 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
13334 "Scalar-SETCC feeding SELECT has unexpected result type!");
13335
13336 // If NumMaskElts == 0, the comparison is larger than the select result. The
13337 // largest real NEON comparison is 64-bits per lane, which means the result is
13338 // at most 32-bits and an illegal vector. Just bail out for now.
13339 EVT SrcVT = N0.getOperand(0).getValueType();
13340
13341 // Don't try to do this optimization when the setcc itself has i1 operands.
13342 // There are no legal vectors of i1, so this would be pointless.
13343 if (SrcVT == MVT::i1)
13344 return SDValue();
13345
13346 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
13347 if (!ResVT.isVector() || NumMaskElts == 0)
13348 return SDValue();
13349
13351 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
13352
13353 // Also bail out if the vector CCVT isn't the same size as ResVT.
13354 // This can happen if the SETCC operand size doesn't divide the ResVT size
13355 // (e.g., f64 vs v3f32).
13356 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
13357 return SDValue();
13358
13359 // Make sure we didn't create illegal types, if we're not supposed to.
13360 assert(DCI.isBeforeLegalize() ||
13361 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
13362
13363 // First perform a vector comparison, where lane 0 is the one we're interested
13364 // in.
13365 SDLoc DL(N0);
13366 SDValue LHS =
13367 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
13368 SDValue RHS =
13369 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
13370 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
13371
13372 // Now duplicate the comparison mask we want across all other lanes.
13373 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
13374 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
13375 Mask = DAG.getNode(ISD::BITCAST, DL,
13376 ResVT.changeVectorElementTypeToInteger(), Mask);
13377
13378 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
13379}
13380
13381/// Get rid of unnecessary NVCASTs (that don't change the type).
13382 static SDValue performNVCASTCombine(SDNode *N) {
13383 if (N->getValueType(0) == N->getOperand(0).getValueType())
13384 return N->getOperand(0);
13385
13386 return SDValue();
13387}
13388
13389// If all users of the globaladdr are of the form (globaladdr + constant), find
13390// the smallest constant, fold it into the globaladdr's offset and rewrite the
13391// globaladdr as (globaladdr + constant) - constant.
13392 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
13393                                            const AArch64Subtarget *Subtarget,
13394                                            const TargetMachine &TM) {
13395 auto *GN = cast<GlobalAddressSDNode>(N);
13396 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
13397 AArch64II::MO_NO_FLAG)
13398 return SDValue();
13399
13400 uint64_t MinOffset = -1ull;
13401 for (SDNode *N : GN->uses()) {
13402 if (N->getOpcode() != ISD::ADD)
13403 return SDValue();
13404 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
13405 if (!C)
13406 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
13407 if (!C)
13408 return SDValue();
13409 MinOffset = std::min(MinOffset, C->getZExtValue());
13410 }
13411 uint64_t Offset = MinOffset + GN->getOffset();
13412
13413 // Require that the new offset is larger than the existing one. Otherwise, we
13414 // can end up oscillating between two possible DAGs, for example,
13415 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
13416 if (Offset <= uint64_t(GN->getOffset()))
13417 return SDValue();
13418
13419 // Check whether folding this offset is legal. It must not go out of bounds of
13420 // the referenced object to avoid violating the code model, and must be
13421 // smaller than 2^21 because this is the largest offset expressible in all
13422 // object formats.
13423 //
13424 // This check also prevents us from folding negative offsets, which will end
13425 // up being treated in the same way as large positive ones. They could also
13426 // cause code model violations, and aren't really common enough to matter.
13427 if (Offset >= (1 << 21))
13428 return SDValue();
13429
13430 const GlobalValue *GV = GN->getGlobal();
13431 Type *T = GV->getValueType();
13432 if (!T->isSized() ||
13433 Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
13434 return SDValue();
13435
13436 SDLoc DL(GN);
13437 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
13438 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
13439 DAG.getConstant(MinOffset, DL, MVT::i64));
13440}
13441
13442// Turns the vector of indices into a vector of byte offstes by scaling Offset
13443// by (BitWidth / 8).
13444 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
13445                                           SDLoc DL, unsigned BitWidth) {
13446 assert(Offset.getValueType().isScalableVector() &&
13447 "This method is only for scalable vectors of offsets");
13448
13449 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
13450 SDValue SplatShift = DAG.getNode(AArch64ISD::DUP, DL, MVT::nxv2i64, Shift);
13451
13452 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
13453}
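// Editorial example (not part of the upstream source): for 32-bit elements
// (BitWidth == 32) the shift amount is Log2_32(4) == 2, so an index vector
// of <0, 5> is turned into the byte-offset vector <0, 20>.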
13454
13455/// Check if the value of \p OffsetInBytes can be used as an immediate for
13456/// the gather load/prefetch and scatter store instructions with vector base and
13457/// immediate offset addressing mode:
13458///
13459/// [<Zn>.[S|D]{, #<imm>}]
13460///
13461/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
13462
13463 static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
13464                                            unsigned ScalarSizeInBytes) {
13465 // The immediate is not a multiple of the scalar size.
13466 if (OffsetInBytes % ScalarSizeInBytes)
13467 return false;
13468
13469 // The immediate is out of range.
13470 if (OffsetInBytes / ScalarSizeInBytes > 31)
13471 return false;
13472
13473 return true;
13474}
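// Editorial example (not part of the upstream source): for 4-byte scalars
// the valid immediates are 0, 4, 8, ..., 124 (= 31 * 4); an offset of 6
// fails the multiple-of-size check and 128 fails the range check.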
13475
13476/// Check if the value of \p Offset represents a valid immediate for the SVE
13477/// gather load/prefetch and scatter store instructiona with vector base and
13478/// immediate offset addressing mode:
13479///
13480/// [<Zn>.[S|D]{, #<imm>}]
13481///
13482/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
13483 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
13484                                            unsigned ScalarSizeInBytes) {
13485 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
13486 return OffsetConst && isValidImmForSVEVecImmAddrMode(
13487 OffsetConst->getZExtValue(), ScalarSizeInBytes);
13488}
13489
13491 unsigned Opcode,
13492 bool OnlyPackedOffsets = true) {
13493 const SDValue Src = N->getOperand(2);
13494 const EVT SrcVT = Src->getValueType(0);
13495 assert(SrcVT.isScalableVector() &&
13496 "Scatter stores are only possible for SVE vectors");
13497
13498 SDLoc DL(N);
13499 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
13500
13501 // Make sure that source data will fit into an SVE register
13502 if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
13503 return SDValue();
13504
13505 // For FPs, ACLE only supports _packed_ single and double precision types.
13506 if (SrcElVT.isFloatingPoint())
13507 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
13508 return SDValue();
13509
13510 // Depending on the addressing mode, this is either a pointer or a vector of
13511 // pointers (that fits into one register)
13512 SDValue Base = N->getOperand(4);
13513 // Depending on the addressing mode, this is either a single offset or a
13514 // vector of offsets (that fits into one register)
13515 SDValue Offset = N->getOperand(5);
13516
13517 // For "scalar + vector of indices", just scale the indices. This only
13518 // applies to non-temporal scatters because there's no instruction that takes
13519 // indices.
13520 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
13521 Offset =
13522 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
13523 Opcode = AArch64ISD::SSTNT1_PRED;
13524 }
13525
13526 // In the case of non-temporal scatter stores there's only one SVE instruction
13527 // per data-size: "scalar + vector", i.e.
13528 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
13529 // Since we do have intrinsics that allow the arguments to be in a different
13530 // order, we may need to swap them to match the spec.
13531 if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
13532 std::swap(Base, Offset);
13533
13534 // SST1_IMM requires that the offset is an immediate that is:
13535 // * a multiple of #SizeInBytes,
13536 // * in the range [0, 31 x #SizeInBytes],
13537 // where #SizeInBytes is the size in bytes of the stored items. For
13538 // immediates outside that range and non-immediate scalar offsets use SST1 or
13539 // SST1_UXTW instead.
13540 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
13541 if (!isValidImmForSVEVecImmAddrMode(Offset,
13542                                     SrcVT.getScalarSizeInBits() / 8)) {
13543 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
13544 Opcode = AArch64ISD::SST1_UXTW_PRED;
13545 else
13546 Opcode = AArch64ISD::SST1_PRED;
13547
13548 std::swap(Base, Offset);
13549 }
13550 }
13551
13552 auto &TLI = DAG.getTargetLoweringInfo();
13553 if (!TLI.isTypeLegal(Base.getValueType()))
13554 return SDValue();
13555
13556 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
13557 // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
13558 // nxv2i64. Legalize accordingly.
13559 if (!OnlyPackedOffsets &&
13560 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
13561 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
13562
13563 if (!TLI.isTypeLegal(Offset.getValueType()))
13564 return SDValue();
13565
13566 // Source value type that is representable in hardware
13567 EVT HwSrcVt = getSVEContainerType(SrcVT);
13568
13569 // Keep the original type of the input data to store - this is needed to be
13570 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
13571 // FP values we want the integer equivalent, so just use HwSrcVt.
13572 SDValue InputVT = DAG.getValueType(SrcVT);
13573 if (SrcVT.isFloatingPoint())
13574 InputVT = DAG.getValueType(HwSrcVt);
13575
13576 SDVTList VTs = DAG.getVTList(MVT::Other);
13577 SDValue SrcNew;
13578
13579 if (Src.getValueType().isFloatingPoint())
13580 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
13581 else
13582 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
13583
13584 SDValue Ops[] = {N->getOperand(0), // Chain
13585 SrcNew,
13586 N->getOperand(3), // Pg
13587 Base,
13588 Offset,
13589 InputVT};
13590
13591 return DAG.getNode(Opcode, DL, VTs, Ops);
13592}
13593
13594 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
13595                                         unsigned Opcode,
13596 bool OnlyPackedOffsets = true) {
13597 const EVT RetVT = N->getValueType(0);
13598 assert(RetVT.isScalableVector() &&
13599 "Gather loads are only possible for SVE vectors");
13600
13601 SDLoc DL(N);
13602
13603 // Make sure that the loaded data will fit into an SVE register
13604 if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
13605 return SDValue();
13606
13607 // Depending on the addressing mode, this is either a pointer or a vector of
13608 // pointers (that fits into one register)
13609 SDValue Base = N->getOperand(3);
13610 // Depending on the addressing mode, this is either a single offset or a
13611 // vector of offsets (that fits into one register)
13612 SDValue Offset = N->getOperand(4);
13613
13614 // For "scalar + vector of indices", just scale the indices. This only
13615 // applies to non-temporal gathers because there's no instruction that takes
13616 // indices.
13617 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
13618 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
13619 RetVT.getScalarSizeInBits());
13620 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
13621 }
13622
13623 // In the case of non-temporal gather loads there's only one SVE instruction
13624 // per data-size: "scalar + vector", i.e.
13625 // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
13626 // Since we do have intrinsics that allow the arguments to be in a different
13627 // order, we may need to swap them to match the spec.
13628 if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
13629 Offset.getValueType().isVector())
13630 std::swap(Base, Offset);
13631
13632 // GLD{FF}1_IMM requires that the offset is an immediate that is:
13633 // * a multiple of #SizeInBytes,
13634 // * in the range [0, 31 x #SizeInBytes],
13635 // where #SizeInBytes is the size in bytes of the loaded items. For
13636 // immediates outside that range and non-immediate scalar offsets use
13637 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
13638 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
13639 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
13640 if (!isValidImmForSVEVecImmAddrMode(Offset,
13641                                     RetVT.getScalarSizeInBits() / 8)) {
13642 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
13643 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
13644              ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
13645              : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
13646 else
13647 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
13648              ? AArch64ISD::GLD1_MERGE_ZERO
13649              : AArch64ISD::GLDFF1_MERGE_ZERO;
13650
13651 std::swap(Base, Offset);
13652 }
13653 }
13654
13655 auto &TLI = DAG.getTargetLoweringInfo();
13656 if (!TLI.isTypeLegal(Base.getValueType()))
13657 return SDValue();
13658
13659 // Some gather load variants allow unpacked offsets, but only as nxv2i32
13660 // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
13661 // nxv2i64. Legalize accordingly.
13662 if (!OnlyPackedOffsets &&
13663 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
13664 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
13665
13666 // Return value type that is representable in hardware
13667 EVT HwRetVt = getSVEContainerType(RetVT);
13668
13669 // Keep the original output value type around - this is needed to be able to
13670 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
13671 // values we want the integer equivalent, so just use HwRetVT.
13672 SDValue OutVT = DAG.getValueType(RetVT);
13673 if (RetVT.isFloatingPoint())
13674 OutVT = DAG.getValueType(HwRetVt);
13675
13677 SDValue Ops[] = {N->getOperand(0), // Chain
13678 N->getOperand(2), // Pg
13679 Base, Offset, OutVT};
13680
13681 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
13682 SDValue LoadChain = SDValue(Load.getNode(), 1);
13683
13684 if (RetVT.isInteger() && (RetVT != HwRetVt))
13685 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
13686
13687 // If the original return value was FP, bitcast accordingly. Doing it here
13688 // means that we can avoid adding TableGen patterns for FPs.
13689 if (RetVT.isFloatingPoint())
13690 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
13691
13692 return DAG.getMergeValues({Load, LoadChain}, DL);
13693}
13694
13695static SDValue
13696 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
13697                               SelectionDAG &DAG) {
13698 if (DCI.isBeforeLegalizeOps())
13699 return SDValue();
13700
13701 SDLoc DL(N);
13702 SDValue Src = N->getOperand(0);
13703 unsigned Opc = Src->getOpcode();
13704
13705 // Sign extend of an unsigned unpack -> signed unpack
13706 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
13707
13708 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
13709 : AArch64ISD::SUNPKLO;
13710
13711 // Push the sign extend to the operand of the unpack
13712 // This is necessary where, for example, the operand of the unpack
13713 // is another unpack:
13714 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
13715 // ->
13716 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
13717 // ->
13718 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
13719 SDValue ExtOp = Src->getOperand(0);
13720 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
13721 EVT EltTy = VT.getVectorElementType();
13722 (void)EltTy;
13723
13724 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
13725 "Sign extending from an invalid type");
13726
13727 EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
13728                              VT.getVectorElementType(),
13729                              VT.getVectorElementCount() * 2);
13730
13731 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
13732 ExtOp, DAG.getValueType(ExtVT));
13733
13734 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
13735 }
13736
13737 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
13738 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
13739 unsigned NewOpc;
13740 unsigned MemVTOpNum = 4;
13741 switch (Opc) {
13742 case AArch64ISD::LD1_MERGE_ZERO: NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
13743 
13744 MemVTOpNum = 3;
13745 break;
13746 case AArch64ISD::LDNF1_MERGE_ZERO: NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
13747 
13748 MemVTOpNum = 3;
13749 break;
13750 case AArch64ISD::LDFF1_MERGE_ZERO: NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
13751 
13752 MemVTOpNum = 3;
13753 break;
13754 case AArch64ISD::GLD1_MERGE_ZERO:
13755 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
13756 break;
13757 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
13758 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
13759 break;
13760 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
13761 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
13762 break;
13763 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
13764 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
13765 break;
13766 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
13767 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
13768 break;
13769 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
13770 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
13771 break;
13772 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
13773 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
13774 break;
13775 case AArch64ISD::GLDFF1_MERGE_ZERO:
13776 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
13777 break;
13778 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
13779 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
13780 break;
13781 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
13782 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
13783 break;
13784 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
13785 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
13786 break;
13787 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
13788 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
13789 break;
13790 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
13791 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
13792 break;
13793 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
13794 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
13795 break;
13796 case AArch64ISD::GLDNT1_MERGE_ZERO:
13797 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
13798 break;
13799 default:
13800 return SDValue();
13801 }
13802
13803 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
13804 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
13805
13806 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
13807 return SDValue();
13808
13809 EVT DstVT = N->getValueType(0);
13810 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
13811
13812 SmallVector<SDValue, 5> Ops;
13813 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
13814 Ops.push_back(Src->getOperand(I));
13815
13816 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
13817 DCI.CombineTo(N, ExtLoad);
13818 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
13819
13820 // Return N so it doesn't get rechecked
13821 return SDValue(N, 0);
13822}
13823
13824/// Legalize the gather prefetch (scalar + vector addressing mode) when the
13825/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
13826/// != nxv2i32) do not need legalization.
13827 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
13828 const unsigned OffsetPos = 4;
13829 SDValue Offset = N->getOperand(OffsetPos);
13830
13831 // Not an unpacked vector, bail out.
13832 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
13833 return SDValue();
13834
13835 // Extend the unpacked offset vector to 64-bit lanes.
13836 SDLoc DL(N);
13837 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
13838 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
13839 // Replace the offset operand with the 64-bit one.
13840 Ops[OffsetPos] = Offset;
13841
13842 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
13843}
13844
13845/// Combines a node carrying the intrinsic
13846/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
13847/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
13848/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
13849/// sve gather prefetch instruction with vector plus immediate addressing mode.
13850 static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
13851                                                unsigned ScalarSizeInBytes) {
13852 const unsigned ImmPos = 4, OffsetPos = 3;
13853 // No need to combine the node if the immediate is valid...
13854 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
13855 return SDValue();
13856
13857 // ...otherwise swap the offset base with the offset...
13858 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
13859 std::swap(Ops[ImmPos], Ops[OffsetPos]);
13860 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
13861 // `aarch64_sve_prfb_gather_uxtw_index`.
13862 SDLoc DL(N);
13863 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
13864 MVT::i64);
13865
13866 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
13867}
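// Editorial example (not part of the upstream source): for
// aarch64_sve_prfh_gather_scalar_offset (2-byte elements) an offset of 64
// exceeds the largest encodable immediate 31 * 2 = 62, so the node is
// remapped to aarch64_sve_prfb_gather_uxtw_index with the base and offset
// operands swapped as above.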
13868
13869 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
13870                                                  DAGCombinerInfo &DCI) const {
13871 SelectionDAG &DAG = DCI.DAG;
13872 switch (N->getOpcode()) {
13873 default:
13874 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
13875 break;
13876 case ISD::ADD:
13877 case ISD::SUB:
13878 return performAddSubLongCombine(N, DCI, DAG);
13879 case ISD::XOR:
13880 return performXorCombine(N, DAG, DCI, Subtarget);
13881 case ISD::MUL:
13882 return performMulCombine(N, DAG, DCI, Subtarget);
13883 case ISD::SINT_TO_FP:
13884 case ISD::UINT_TO_FP:
13885 return performIntToFpCombine(N, DAG, Subtarget);
13886 case ISD::FP_TO_SINT:
13887 case ISD::FP_TO_UINT:
13888 return performFpToIntCombine(N, DAG, DCI, Subtarget);
13889 case ISD::FDIV:
13890 return performFDivCombine(N, DAG, DCI, Subtarget);
13891 case ISD::OR:
13892 return performORCombine(N, DCI, Subtarget);
13893 case ISD::AND:
13894 return performANDCombine(N, DCI);
13895 case ISD::SRL:
13896 return performSRLCombine(N, DCI);
13897 case ISD::INTRINSIC_WO_CHAIN:
13898 return performIntrinsicCombine(N, DCI, Subtarget);
13899 case ISD::ANY_EXTEND:
13900 case ISD::ZERO_EXTEND:
13901 case ISD::SIGN_EXTEND:
13902 return performExtendCombine(N, DCI, DAG);
13903 case ISD::SIGN_EXTEND_INREG:
13904 return performSignExtendInRegCombine(N, DCI, DAG);
13905 case ISD::CONCAT_VECTORS:
13906 return performConcatVectorsCombine(N, DCI, DAG);
13907 case ISD::SELECT:
13908 return performSelectCombine(N, DCI);
13909 case ISD::VSELECT:
13910 return performVSelectCombine(N, DCI.DAG);
13911 case ISD::LOAD:
13912 if (performTBISimplification(N->getOperand(1), DCI, DAG))
13913 return SDValue(N, 0);
13914 break;
13915 case ISD::STORE:
13916 return performSTORECombine(N, DCI, DAG, Subtarget);
13917 case AArch64ISD::BRCOND:
13918 return performBRCONDCombine(N, DCI, DAG);
13919 case AArch64ISD::TBNZ:
13920 case AArch64ISD::TBZ:
13921 return performTBZCombine(N, DCI, DAG);
13922 case AArch64ISD::CSEL:
13923 return performCONDCombine(N, DCI, DAG, 2, 3);
13924 case AArch64ISD::DUP:
13925 return performPostLD1Combine(N, DCI, false);
13926 case AArch64ISD::NVCAST:
13927 return performNVCASTCombine(N);
13928 case ISD::INSERT_VECTOR_ELT:
13929 return performPostLD1Combine(N, DCI, true);
13930 case ISD::INTRINSIC_VOID:
13931 case ISD::INTRINSIC_W_CHAIN:
13932 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
13933 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
13934 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
13935 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
13936 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
13937 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
13938 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
13939 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
13940 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
13941 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
13942 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
13943 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
13944 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
13945 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
13946 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
13947 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
13948 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
13949 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
13950 case Intrinsic::aarch64_neon_ld2:
13951 case Intrinsic::aarch64_neon_ld3:
13952 case Intrinsic::aarch64_neon_ld4:
13953 case Intrinsic::aarch64_neon_ld1x2:
13954 case Intrinsic::aarch64_neon_ld1x3:
13955 case Intrinsic::aarch64_neon_ld1x4:
13956 case Intrinsic::aarch64_neon_ld2lane:
13957 case Intrinsic::aarch64_neon_ld3lane:
13958 case Intrinsic::aarch64_neon_ld4lane:
13959 case Intrinsic::aarch64_neon_ld2r:
13960 case Intrinsic::aarch64_neon_ld3r:
13961 case Intrinsic::aarch64_neon_ld4r:
13962 case Intrinsic::aarch64_neon_st2:
13963 case Intrinsic::aarch64_neon_st3:
13964 case Intrinsic::aarch64_neon_st4:
13965 case Intrinsic::aarch64_neon_st1x2:
13966 case Intrinsic::aarch64_neon_st1x3:
13967 case Intrinsic::aarch64_neon_st1x4:
13968 case Intrinsic::aarch64_neon_st2lane:
13969 case Intrinsic::aarch64_neon_st3lane:
13970 case Intrinsic::aarch64_neon_st4lane:
13971 return performNEONPostLDSTCombine(N, DCI, DAG);
13972 case Intrinsic::aarch64_sve_ldnt1:
13973 return performLDNT1Combine(N, DAG);
13974 case Intrinsic::aarch64_sve_ld1rq:
13975 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
13976 case Intrinsic::aarch64_sve_ld1ro:
13977 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
13978 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
13979 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
13980 case Intrinsic::aarch64_sve_ldnt1_gather:
13981 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
13982 case Intrinsic::aarch64_sve_ldnt1_gather_index:
13983 return performGatherLoadCombine(N, DAG,
13984                                 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
13985 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
13986 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
13987 case Intrinsic::aarch64_sve_ld1:
13988 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
13989 case Intrinsic::aarch64_sve_ldnf1:
13990 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
13991 case Intrinsic::aarch64_sve_ldff1:
13992 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
13993 case Intrinsic::aarch64_sve_st1:
13994 return performST1Combine(N, DAG);
13995 case Intrinsic::aarch64_sve_stnt1:
13996 return performSTNT1Combine(N, DAG);
13997 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
13998 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
13999 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
14000 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
14001 case Intrinsic::aarch64_sve_stnt1_scatter:
14002 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
14003 case Intrinsic::aarch64_sve_stnt1_scatter_index:
14004 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
14005 case Intrinsic::aarch64_sve_ld1_gather:
14006 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
14007 case Intrinsic::aarch64_sve_ld1_gather_index:
14008 return performGatherLoadCombine(N, DAG,
14009                                 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
14010 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
14011 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
14012                                 /*OnlyPackedOffsets=*/false);
14013 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
14014 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
14015                                 /*OnlyPackedOffsets=*/false);
14016 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
14017 return performGatherLoadCombine(N, DAG,
14018                                 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
14019                                 /*OnlyPackedOffsets=*/false);
14020 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
14021 return performGatherLoadCombine(N, DAG,
14022                                 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
14023                                 /*OnlyPackedOffsets=*/false);
14024 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
14025 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
14026 case Intrinsic::aarch64_sve_ldff1_gather:
14027 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
14028 case Intrinsic::aarch64_sve_ldff1_gather_index:
14029 return performGatherLoadCombine(N, DAG,
14030                                 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
14031 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
14032 return performGatherLoadCombine(N, DAG,
14033                                 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
14034                                 /*OnlyPackedOffsets=*/false);
14035 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
14036 return performGatherLoadCombine(N, DAG,
14037                                 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
14038                                 /*OnlyPackedOffsets=*/false);
14039 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
14040 return performGatherLoadCombine(N, DAG,
14041                                 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
14042                                 /*OnlyPackedOffsets=*/false);
14043 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
14044 return performGatherLoadCombine(N, DAG,
14045                                 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
14046                                 /*OnlyPackedOffsets=*/false);
14047 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
14048 return performGatherLoadCombine(N, DAG,
14049                                 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
14050 case Intrinsic::aarch64_sve_st1_scatter:
14051 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
14052 case Intrinsic::aarch64_sve_st1_scatter_index:
14053 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
14054 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
14055 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
14056                                   /*OnlyPackedOffsets=*/false);
14057 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
14058 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
14059                                   /*OnlyPackedOffsets=*/false);
14060 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
14061 return performScatterStoreCombine(N, DAG,
14062                                   AArch64ISD::SST1_SXTW_SCALED_PRED,
14063                                   /*OnlyPackedOffsets=*/false);
14064 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
14065 return performScatterStoreCombine(N, DAG,
14066                                   AArch64ISD::SST1_UXTW_SCALED_PRED,
14067                                   /*OnlyPackedOffsets=*/false);
14068 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
14069 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
14070 case Intrinsic::aarch64_sve_tuple_get: {
14071 SDLoc DL(N);
14072 SDValue Chain = N->getOperand(0);
14073 SDValue Src1 = N->getOperand(2);
14074 SDValue Idx = N->getOperand(3);
14075
14076 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
14077 EVT ResVT = N->getValueType(0);
14078 uint64_t NumLanes = ResVT.getVectorElementCount().Min;
14079 SDValue Val =
14080 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1,
14081             DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32));
14082 return DAG.getMergeValues({Val, Chain}, DL);
14083 }
14084 case Intrinsic::aarch64_sve_tuple_set: {
14085 SDLoc DL(N);
14086 SDValue Chain = N->getOperand(0);
14087 SDValue Tuple = N->getOperand(2);
14088 SDValue Idx = N->getOperand(3);
14089 SDValue Vec = N->getOperand(4);
14090
14091 EVT TupleVT = Tuple.getValueType();
14092 uint64_t TupleLanes = TupleVT.getVectorElementCount().Min;
14093
14094 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
14095 uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min;
14096
14097 if ((TupleLanes % NumLanes) != 0)
14098 report_fatal_error("invalid tuple vector!");
14099
14100 uint64_t NumVecs = TupleLanes / NumLanes;
14101
14103 for (unsigned I = 0; I < NumVecs; ++I) {
14104 if (I == IdxConst)
14105 Opnds.push_back(Vec);
14106 else {
14107 Opnds.push_back(
14108 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Vec.getValueType(), Tuple,
14109 DAG.getConstant(I * NumLanes, DL, MVT::i32)));
14110 }
14111 }
14112 SDValue Concat =
14113 DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
14114 return DAG.getMergeValues({Concat, Chain}, DL);
14115 }
14116 case Intrinsic::aarch64_sve_tuple_create2:
14117 case Intrinsic::aarch64_sve_tuple_create3:
14118 case Intrinsic::aarch64_sve_tuple_create4: {
14119 SDLoc DL(N);
14120 SDValue Chain = N->getOperand(0);
14121
14122 SmallVector<SDValue, 4> Opnds;
14123 for (unsigned I = 2; I < N->getNumOperands(); ++I)
14124 Opnds.push_back(N->getOperand(I));
14125
14126 EVT VT = Opnds[0].getValueType();
14127 EVT EltVT = VT.getVectorElementType();
14128 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
14129                               VT.getVectorElementCount() *
14130                                   (N->getNumOperands() - 2));
14131 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
14132 return DAG.getMergeValues({Concat, Chain}, DL);
14133 }
14134 case Intrinsic::aarch64_sve_ld2:
14135 case Intrinsic::aarch64_sve_ld3:
14136 case Intrinsic::aarch64_sve_ld4: {
14137 SDLoc DL(N);
14138 SDValue Chain = N->getOperand(0);
14139 SDValue Mask = N->getOperand(2);
14140 SDValue BasePtr = N->getOperand(3);
14141 SDValue LoadOps[] = {Chain, Mask, BasePtr};
14142 unsigned IntrinsicID =
14143 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
14144 SDValue Result =
14145 LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
14146 return DAG.getMergeValues({Result, Chain}, DL);
14147 }
14148 default:
14149 break;
14150 }
14151 break;
14152 case ISD::GlobalAddress:
14153 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
14154 }
14155 return SDValue();
14156}
14157
14158// Check if the return value is used as only a return value, as otherwise
14159// we can't perform a tail-call. In particular, we need to check for
14160// target ISD nodes that are returns and any other "odd" constructs
14161// that the generic analysis code won't necessarily catch.
14162bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
14163 SDValue &Chain) const {
14164 if (N->getNumValues() != 1)
14165 return false;
14166 if (!N->hasNUsesOfValue(1, 0))
14167 return false;
14168
14169 SDValue TCChain = Chain;
14170 SDNode *Copy = *N->use_begin();
14171 if (Copy->getOpcode() == ISD::CopyToReg) {
14172 // If the copy has a glue operand, we conservatively assume it isn't safe to
14173 // perform a tail call.
14174 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
14175 MVT::Glue)
14176 return false;
14177 TCChain = Copy->getOperand(0);
14178 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
14179 return false;
14180
14181 bool HasRet = false;
14182 for (SDNode *Node : Copy->uses()) {
14183 if (Node->getOpcode() != AArch64ISD::RET_FLAG)
14184 return false;
14185 HasRet = true;
14186 }
14187
14188 if (!HasRet)
14189 return false;
14190
14191 Chain = TCChain;
14192 return true;
14193}
14194
14195// Return whether the an instruction can potentially be optimized to a tail
14196// call. This will cause the optimizers to attempt to move, or duplicate,
14197// return instructions to help enable tail call optimizations for this
14198// instruction.
14199bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
14200 return CI->isTailCall();
14201}
14202
14203bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
14204 SDValue &Offset,
14205                                                    ISD::MemIndexedMode &AM,
14206                                                    bool &IsInc,
14207 SelectionDAG &DAG) const {
14208 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
14209 return false;
14210
14211 Base = Op->getOperand(0);
14212 // All of the indexed addressing mode instructions take a signed
14213 // 9 bit immediate offset.
14214 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
14215 int64_t RHSC = RHS->getSExtValue();
14216 if (Op->getOpcode() == ISD::SUB)
14217 RHSC = -(uint64_t)RHSC;
14218 if (!isInt<9>(RHSC))
14219 return false;
14220 IsInc = (Op->getOpcode() == ISD::ADD);
14221 Offset = Op->getOperand(1);
14222 return true;
14223 }
14224 return false;
14225}
14226
14227bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
14228 SDValue &Offset,
14229                                                       ISD::MemIndexedMode &AM,
14230                                                       SelectionDAG &DAG) const {
14231 EVT VT;
14232 SDValue Ptr;
14233 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
14234 VT = LD->getMemoryVT();
14235 Ptr = LD->getBasePtr();
14236 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
14237 VT = ST->getMemoryVT();
14238 Ptr = ST->getBasePtr();
14239 } else
14240 return false;
14241
14242 bool IsInc;
14243 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
14244 return false;
14245 AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
14246 return true;
14247}
14248
14249bool AArch64TargetLowering::getPostIndexedAddressParts(
14250 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
14251 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
14252 EVT VT;
14253 SDValue Ptr;
14254 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
14255 VT = LD->getMemoryVT();
14256 Ptr = LD->getBasePtr();
14257 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
14258 VT = ST->getMemoryVT();
14259 Ptr = ST->getBasePtr();
14260 } else
14261 return false;
14262
14263 bool IsInc;
14264 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
14265 return false;
14266 // Post-indexing updates the base, so it's not a valid transform
14267 // if that's not the same as the load's pointer.
14268 if (Ptr != Base)
14269 return false;
14270 AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
14271 return true;
14272}
14273
14274 static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
14275                                   SelectionDAG &DAG) {
14276 SDLoc DL(N);
14277 SDValue Op = N->getOperand(0);
14278
14279 if (N->getValueType(0) != MVT::i16 ||
14280 (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16))
14281 return;
14282
14283 Op = SDValue(
14284 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
14285 DAG.getUNDEF(MVT::i32), Op,
14286 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
14287 0);
14288 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
14289 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
14290}
14291
14292 static void ReplaceReductionResults(SDNode *N,
14293                                     SmallVectorImpl<SDValue> &Results,
14294                                     SelectionDAG &DAG, unsigned InterOp,
14295 unsigned AcrossOp) {
14296 EVT LoVT, HiVT;
14297 SDValue Lo, Hi;
14298 SDLoc dl(N);
14299 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
14300 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
14301 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
14302 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
14303 Results.push_back(SplitVal);
14304}
14305
14306static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
14307 SDLoc DL(N);
14308 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
14309 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
14310                          DAG.getNode(ISD::SRL, DL, MVT::i128, N,
14311 DAG.getConstant(64, DL, MVT::i64)));
14312 return std::make_pair(Lo, Hi);
14313}
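// Editorial note (not part of the upstream source): splitInt128 produces
// Lo = trunc(V) and Hi = trunc(V >> 64) for an i128 V, the two-register
// form consumed by the CMP_SWAP_128 pseudo-instruction below.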
14314
14315void AArch64TargetLowering::ReplaceExtractSubVectorResults(
14316     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
14317 SDValue In = N->getOperand(0);
14318 EVT InVT = In.getValueType();
14319
14320 // Common code will handle these just fine.
14321 if (!InVT.isScalableVector() || !InVT.isInteger())
14322 return;
14323
14324 SDLoc DL(N);
14325 EVT VT = N->getValueType(0);
14326
14327 // The following checks bail if this is not a halving operation.
14328
14329 auto ResEC = VT.getVectorElementCount();
14330
14331 if (InVT.getVectorElementCount().Min != (ResEC.Min * 2))
14332 return;
14333
14334 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
14335 if (!CIndex)
14336 return;
14337
14338 unsigned Index = CIndex->getZExtValue();
14339 if ((Index != 0) && (Index != ResEC.Min))
14340 return;
14341
14342 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
14343 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
14344
14345 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
14346 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
14347}
14348
14349// Create an even/odd pair of X registers holding integer value V.
14350 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
14351 SDLoc dl(V.getNode());
14352 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
14353 SDValue VHi = DAG.getAnyExtOrTrunc(
14354     DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
14355 dl, MVT::i64);
14356 if (DAG.getDataLayout().isBigEndian())
14357 std::swap (VLo, VHi);
14358 SDValue RegClass =
14359 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
14360 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
14361 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
14362 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
14363 return SDValue(
14364 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
14365}
14366
14367 static void ReplaceCMP_SWAP_128Results(SDNode *N,
14368                                        SmallVectorImpl<SDValue> &Results,
14369                                        SelectionDAG &DAG,
14370 const AArch64Subtarget *Subtarget) {
14371 assert(N->getValueType(0) == MVT::i128 &&
14372 "AtomicCmpSwap on types less than 128 should be legal");
14373
14374 if (Subtarget->hasLSE()) {
14375 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
14376 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
14377 SDValue Ops[] = {
14378 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
14379 createGPRPairNode(DAG, N->getOperand(3)), // Store value
14380 N->getOperand(1), // Ptr
14381 N->getOperand(0), // Chain in
14382 };
14383
14384 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
14385
14386 unsigned Opcode;
14387 switch (MemOp->getOrdering()) {
14388 case AtomicOrdering::Monotonic:
14389 Opcode = AArch64::CASPX;
14390 break;
14391 case AtomicOrdering::Acquire:
14392 Opcode = AArch64::CASPAX;
14393 break;
14394 case AtomicOrdering::Release:
14395 Opcode = AArch64::CASPLX;
14396 break;
14397 case AtomicOrdering::AcquireRelease:
14398 case AtomicOrdering::SequentiallyConsistent:
14399 Opcode = AArch64::CASPALX;
14400 break;
14401 default:
14402 llvm_unreachable("Unexpected ordering!");
14403 }
14404
14405 MachineSDNode *CmpSwap = DAG.getMachineNode(
14406     Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
14407 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
14408
14409 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
14410 if (DAG.getDataLayout().isBigEndian())
14411 std::swap(SubReg1, SubReg2);
14412 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
14413                                         SDValue(CmpSwap, 0));
14414 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
14415                                         SDValue(CmpSwap, 0));
14416 Results.push_back(
14417 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
14418 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
14419 return;
14420 }
14421
14422 auto Desired = splitInt128(N->getOperand(2), DAG);
14423 auto New = splitInt128(N->getOperand(3), DAG);
14424 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
14425 New.first, New.second, N->getOperand(0)};
14426 MachineSDNode *CmpSwap = DAG.getMachineNode(
14427     AArch64::CMP_SWAP_128, SDLoc(N),
14428     DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
14429
14430 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
14431 DAG.setNodeMemRefs(CmpSwap, {MemOp});
14432
14433 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
14434                               SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
14435 Results.push_back(SDValue(CmpSwap, 3));
14436}
14437
14438 void AArch64TargetLowering::ReplaceNodeResults(
14439     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
14440 switch (N->getOpcode()) {
14441 default:
14442 llvm_unreachable("Don't know how to custom expand this");
14443 case ISD::BITCAST:
14444 ReplaceBITCASTResults(N, Results, DAG);
14445 return;
14446 case ISD::VECREDUCE_ADD:
14447 case ISD::VECREDUCE_SMAX:
14448 case ISD::VECREDUCE_SMIN:
14449 case ISD::VECREDUCE_UMAX:
14450 case ISD::VECREDUCE_UMIN:
14451 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
14452 return;
14453
14454 case ISD::CTPOP:
14455 Results.push_back(LowerCTPOP(SDValue(N, 0), DAG));
14456 return;
14457 case AArch64ISD::SADDV:
14458 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
14459 return;
14460 case AArch64ISD::UADDV:
14461 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
14462 return;
14463 case AArch64ISD::SMINV:
14464 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
14465 return;
14466 case AArch64ISD::UMINV:
14467 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
14468 return;
14469 case AArch64ISD::SMAXV:
14470 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
14471 return;
14472 case AArch64ISD::UMAXV:
14473 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
14474 return;
14475 case ISD::FP_TO_UINT:
14476 case ISD::FP_TO_SINT:
14477 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
14478 // Let normal code take care of it by not adding anything to Results.
14479 return;
14480 case ISD::ATOMIC_CMP_SWAP:
14481 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
14482 return;
14483 case ISD::LOAD: {
14485 "unexpected load's value type");
14486 LoadSDNode *LoadNode = cast<LoadSDNode>(N);
14487 if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
14488 // Non-volatile loads are optimized later in AArch64's load/store
14489 // optimizer.
14490 return;
14491 }
14492
14493 SDValue Result = DAG.getMemIntrinsicNode(
14494     AArch64ISD::LDP, SDLoc(N),
14495     DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
14496 {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
14497 LoadNode->getMemOperand());
14498
14499 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
14500                            Result.getValue(0), Result.getValue(1));
14501 Results.append({Pair, Result.getValue(2) /* Chain */});
14502 return;
14503 }
14504 case ISD::EXTRACT_SUBVECTOR:
14505 ReplaceExtractSubVectorResults(N, Results, DAG);
14506 return;
14507 case ISD::INTRINSIC_WO_CHAIN: {
14508 EVT VT = N->getValueType(0);
14509 assert((VT == MVT::i8 || VT == MVT::i16) &&
14510 "custom lowering for unexpected type");
14511
14512 ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
14513 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
14514 switch (IntID) {
14515 default:
14516 return;
14517 case Intrinsic::aarch64_sve_clasta_n: {
14518 SDLoc DL(N);
14519 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
14520 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
14521 N->getOperand(1), Op2, N->getOperand(3));
14522 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
14523 return;
14524 }
14525 case Intrinsic::aarch64_sve_clastb_n: {
14526 SDLoc DL(N);
14527 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
14528 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
14529 N->getOperand(1), Op2, N->getOperand(3));
14530 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
14531 return;
14532 }
14533 case Intrinsic::aarch64_sve_lasta: {
14534 SDLoc DL(N);
14535 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
14536 N->getOperand(1), N->getOperand(2));
14537 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
14538 return;
14539 }
14540 case Intrinsic::aarch64_sve_lastb: {
14541 SDLoc DL(N);
14542 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
14543 N->getOperand(1), N->getOperand(2));
14544 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
14545 return;
14546 }
14547 }
14548 }
14549 }
14550}
14551
14552 bool AArch64TargetLowering::useLoadStackGuardNode() const {
14553 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
14554 return TargetLowering::useLoadStackGuardNode();
14555 return true;
14556}
14557
14558unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
14559 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
14560 // reciprocal if there are three or more FDIVs.
14561 return 3;
14562}
14563
14564 TargetLoweringBase::LegalizeTypeAction
14565 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
14566 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
14567 // v4i16, v2i32 instead of to promote.
14568 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
14569 VT == MVT::v1f32)
14570 return TypeWidenVector;
14571
14572 return TargetLoweringBase::getPreferredVectorAction(VT);
14573}
14574
14575// Loads and stores less than 128-bits are already atomic; ones above that
14576// are doomed anyway, so defer to the default libcall and blame the OS when
14577// things go wrong.
14578 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
14579 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
14580 return Size == 128;
14581}
14582
14583// Loads and stores less than 128-bits are already atomic; ones above that
14584// are doomed anyway, so defer to the default libcall and blame the OS when
14585// things go wrong.
14586 TargetLowering::AtomicExpansionKind
14587 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
14588 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
14589 return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
14590 }
14591
14592// For the real atomic operations, we have ldxr/stxr up to 128 bits,
14593 TargetLowering::AtomicExpansionKind
14594 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
14595 if (AI->isFloatingPointOperation())
14596 return AtomicExpansionKind::CmpXChg;
14597
14598 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
14599 if (Size > 128) return AtomicExpansionKind::None;
14600 // Nand not supported in LSE.
14601 if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;
14602 // Leave 128 bits to LLSC.
14603 return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC;
14604}
14605
14606 TargetLowering::AtomicExpansionKind
14607 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
14608     AtomicCmpXchgInst *AI) const {
14609 // If subtarget has LSE, leave cmpxchg intact for codegen.
14610 if (Subtarget->hasLSE())
14611 return AtomicExpansionKind::None;
14612 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
14613 // implement cmpxchg without spilling. If the address being exchanged is also
14614 // on the stack and close enough to the spill slot, this can lead to a
14615 // situation where the monitor always gets cleared and the atomic operation
14616 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
14617 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
14618 return AtomicExpansionKind::None;
14619 return AtomicExpansionKind::LLSC;
14620}
14621
14622Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
14623 AtomicOrdering Ord) const {
14624 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
14625 Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
14626 bool IsAcquire = isAcquireOrStronger(Ord);
14627
14628 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
14629 // intrinsic must return {i64, i64} and we have to recombine them into a
14630 // single i128 here.
14631 if (ValTy->getPrimitiveSizeInBits() == 128) {
14632 Intrinsic::ID Int =
14633 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
14634 Function *Ldxr = Intrinsic::getDeclaration(M, Int);
14635
14636 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
14637 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
14638
14639 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
14640 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
14641 Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
14642 Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
14643 return Builder.CreateOr(
14644 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
14645 }
14646
14647 Type *Tys[] = { Addr->getType() };
14648 Intrinsic::ID Int =
14649 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
14650 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
14651
14652 Type *EltTy = cast<PointerType>(Addr->getType())->getElementType();
14653
14654 const DataLayout &DL = M->getDataLayout();
14655 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy));
14656 Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
14657
14658 return Builder.CreateBitCast(Trunc, EltTy);
14659}
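// For illustration, the i128 acquire path above builds IR along these lines
// (a sketch; value names follow the ones passed to the builder):
//   %lohi  = call { i64, i64 } @llvm.aarch64.ldaxp(i8* %addr)
//   %lo    = extractvalue { i64, i64 } %lohi, 0
//   %hi    = extractvalue { i64, i64 } %lohi, 1
//   %lo64  = zext i64 %lo to i128
//   %hi64  = zext i64 %hi to i128
//   %shl   = shl i128 %hi64, 64
//   %val64 = or i128 %lo64, %shl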
14660
14661void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
14662 IRBuilder<> &Builder) const {
14663 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
14664 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
14665}
14666
14667Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
14668 Value *Val, Value *Addr,
14669 AtomicOrdering Ord) const {
14670 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
14671 bool IsRelease = isReleaseOrStronger(Ord);
14672
14673 // Since the intrinsics must have legal type, the i128 intrinsics take two
14674 // parameters: "i64, i64". We must marshal Val into the appropriate form
14675 // before the call.
14676 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
14677 Intrinsic::ID Int =
14678 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
14679 Function *Stxr = Intrinsic::getDeclaration(M, Int);
14680 Type *Int64Ty = Type::getInt64Ty(M->getContext());
14681
14682 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
14683 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
14684 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
14685 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
14686 }
14687
14688 Intrinsic::ID Int =
14689 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
14690 Type *Tys[] = { Addr->getType() };
14691 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
14692
14693 const DataLayout &DL = M->getDataLayout();
14694 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
14695 Val = Builder.CreateBitCast(Val, IntValTy);
14696
14697 return Builder.CreateCall(Stxr,
14698 {Builder.CreateZExtOrBitCast(
14699 Val, Stxr->getFunctionType()->getParamType(0)),
14700 Addr});
14701}
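// For illustration, the i128 release path above builds roughly:
//   %lo     = trunc i128 %val to i64
//   %shr    = lshr i128 %val, 64
//   %hi     = trunc i128 %shr to i64
//   %status = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, i8* %addr)
// where a non-zero %status tells the enclosing LL/SC loop to retry.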
14702
14703bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
14704 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
14705 return Ty->isArrayTy();
14706}
14707
14708bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
14709 EVT) const {
14710 return false;
14711}
14712
14713static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
14714 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
14715 Function *ThreadPointerFunc =
14716 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
14717 return IRB.CreatePointerCast(
14718 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
14719 Offset),
14720 IRB.getInt8PtrTy()->getPointerTo(0));
14721}
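// Illustrative use: UseTlsOffset(IRB, 0x28) on Android yields the address of
// TLS slot 5 (0x28 = 5 * 8 bytes from the thread pointer), roughly:
//   %tp  = call i8* @llvm.thread.pointer()
//   %gep = getelementptr i8, i8* %tp, i32 40
//   %loc = bitcast i8* %gep to i8**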
14722
14723Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
14724 // Android provides a fixed TLS slot for the stack cookie. See the definition
14725 // of TLS_SLOT_STACK_GUARD in
14726 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
14727 if (Subtarget->isTargetAndroid())
14728 return UseTlsOffset(IRB, 0x28);
14729
14730 // Fuchsia is similar.
14731 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
14732 if (Subtarget->isTargetFuchsia())
14733 return UseTlsOffset(IRB, -0x10);
14734
14735 return TargetLowering::getIRStackGuard(IRB);
14736}
14737
14738void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
14739 // MSVC CRT provides functionalities for stack protection.
14740 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
14741 // MSVC CRT has a global variable holding security cookie.
14742 M.getOrInsertGlobal("__security_cookie",
14743 Type::getInt8PtrTy(M.getContext()));
14744
14745 // MSVC CRT has a function to validate security cookie.
14746 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
14747 "__security_check_cookie", Type::getVoidTy(M.getContext()),
14748 Type::getInt8PtrTy(M.getContext()));
14749 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
14750 F->setCallingConv(CallingConv::Win64);
14751 F->addAttribute(1, Attribute::AttrKind::InReg);
14752 }
14753 return;
14754 }
14755 TargetLowering::insertSSPDeclarations(M);
14756}
14757
14758Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
14759 // MSVC CRT has a global variable holding security cookie.
14760 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
14761 return M.getGlobalVariable("__security_cookie");
14762 return TargetLowering::getSDagStackGuard(M);
14763}
14764
14765Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
14766 // MSVC CRT has a function to validate security cookie.
14767 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
14768 return M.getFunction("__security_check_cookie");
14769 return TargetLowering::getSSPStackGuardCheck(M);
14770}
14771
14772Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
14773 // Android provides a fixed TLS slot for the SafeStack pointer. See the
14774 // definition of TLS_SLOT_SAFESTACK in
14775 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
14776 if (Subtarget->isTargetAndroid())
14777 return UseTlsOffset(IRB, 0x48);
14778
14779 // Fuchsia is similar.
14780 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
14781 if (Subtarget->isTargetFuchsia())
14782 return UseTlsOffset(IRB, -0x8);
14783
14784 return TargetLowering::getSafeStackPointerLocation(IRB);
14785}
14786
14787bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
14788 const Instruction &AndI) const {
14789 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
14790 // this is likely to fold the and/cmp/br into a single tbz instruction. It
14791 // may be beneficial to sink in other cases, but we would have to check that
14792 // the cmp would not get folded into the br to form a cbz for these to be
14793 // beneficial.
14794 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
14795 if (!Mask)
14796 return false;
14797 return Mask->getValue().isPowerOf2();
14798}
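// Illustrative example: with the power-of-two mask test above, a sequence
// like
//   %a = and i32 %x, 4
//   %c = icmp eq i32 %a, 0
//   br i1 %c, label %t, label %f
// can be selected to a single "tbz w0, #2, <target>" once the 'and' has been
// sunk next to its compare.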
14799
14800bool AArch64TargetLowering::
14801 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
14802 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
14803 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
14804 SelectionDAG &DAG) const {
14805 // Does baseline recommend not to perform the fold by default?
14806 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
14807 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
14808 return false;
14809 // Else, if this is a vector shift, prefer 'shl'.
14810 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
14811}
14812
14813bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
14814 SDNode *N) const {
14815 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
14816 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
14817 return false;
14818 return true;
14819}
14820
14821void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
14822 // Update IsSplitCSR in AArch64FunctionInfo.
14823 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
14824 AFI->setIsSplitCSR(true);
14825}
14826
14827void AArch64TargetLowering::insertCopiesSplitCSR(
14828 MachineBasicBlock *Entry,
14829 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
14830 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14831 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
14832 if (!IStart)
14833 return;
14834
14835 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
14836 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
14837 MachineBasicBlock::iterator MBBI = Entry->begin();
14838 for (const MCPhysReg *I = IStart; *I; ++I) {
14839 const TargetRegisterClass *RC = nullptr;
14840 if (AArch64::GPR64RegClass.contains(*I))
14841 RC = &AArch64::GPR64RegClass;
14842 else if (AArch64::FPR64RegClass.contains(*I))
14843 RC = &AArch64::FPR64RegClass;
14844 else
14845 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
14846
14847 Register NewVR = MRI->createVirtualRegister(RC);
14848 // Create copy from CSR to a virtual register.
14849 // FIXME: this currently does not emit CFI pseudo-instructions, it works
14850 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
14851 // nounwind. If we want to generalize this later, we may need to emit
14852 // CFI pseudo-instructions.
14853 assert(Entry->getParent()->getFunction().hasFnAttribute(
14854 Attribute::NoUnwind) &&
14855 "Function should be nounwind in insertCopiesSplitCSR!");
14856 Entry->addLiveIn(*I);
14857 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
14858 .addReg(*I);
14859
14860 // Insert the copy-back instructions right before the terminator.
14861 for (auto *Exit : Exits)
14862 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
14863 TII->get(TargetOpcode::COPY), *I)
14864 .addReg(NewVR);
14865 }
14866}
14867
14868bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
14869 // Integer division on AArch64 is expensive. However, when aggressively
14870 // optimizing for code size, we prefer to use a div instruction, as it is
14871 // usually smaller than the alternative sequence.
14872 // The exception to this is vector division. Since AArch64 doesn't have vector
14873 // integer division, leaving the division as-is is a loss even in terms of
14874 // size, because it will have to be scalarized, while the alternative code
14875 // sequence can be performed in vector form.
14876 bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
14877 return OptSize && !VT.isVector();
14878}
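// Illustrative example: under minsize, "sdiv i32 %x, 3" stays a single SDIV
// instruction, whereas at other optimization levels the divide-by-constant
// is expanded into a larger but faster multiply-by-magic-constant sequence.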
14879
14880bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
14881 // We want inc-of-add for scalars and sub-of-not for vectors.
14882 return VT.isScalarInteger();
14883}
14884
14885bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
14886 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
14887}
14888
14889unsigned
14890AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
14891 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
14892 return getPointerTy(DL).getSizeInBits();
14893
14894 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
14895}
14896
14897void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
14898 MF.getFrameInfo().computeMaxCallFrameSize(MF);
14899 TargetLoweringBase::finalizeLowering(MF);
14900}
14901
14902// Unlike X86, we let frame lowering assign offsets to all catch objects.
14903bool AArch64TargetLowering::needsFixedCatchObjects() const {
14904 return false;
14905}
14906
14907bool AArch64TargetLowering::shouldLocalize(
14908 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
14909 switch (MI.getOpcode()) {
14910 case TargetOpcode::G_GLOBAL_VALUE: {
14911 // On Darwin, TLS global vars get selected into function calls, which
14912 // we don't want localized, as they can get moved into the middle of
14913 // another call sequence.
14914 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
14915 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
14916 return false;
14917 break;
14918 }
14919 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
14920 // localizable.
14921 case AArch64::ADRP:
14922 case AArch64::G_ADD_LOW:
14923 return true;
14924 default:
14925 break;
14926 }
14927 return TargetLoweringBase::shouldLocalize(MI, TTI);
14928}
14929
14930bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
14931 if (isa<ScalableVectorType>(Inst.getType()))
14932 return true;
14933
14934 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
14935 if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
14936 return true;
14937
14938 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
14939 if (isa<ScalableVectorType>(AI->getAllocatedType()))
14940 return true;
14941 }
14942
14943 return false;
14944}
14945
14946// Return the largest legal scalable vector type that matches VT's element type.
14947static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
14948 assert(VT.isFixedLengthVector() &&
14949 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
14950 "Expected legal fixed length vector!");
14951 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
14952 default:
14953 llvm_unreachable("unexpected element type for SVE container");
14954 case MVT::i8:
14955 return EVT(MVT::nxv16i8);
14956 case MVT::i16:
14957 return EVT(MVT::nxv8i16);
14958 case MVT::i32:
14959 return EVT(MVT::nxv4i32);
14960 case MVT::i64:
14961 return EVT(MVT::nxv2i64);
14962 case MVT::f16:
14963 return EVT(MVT::nxv8f16);
14964 case MVT::f32:
14965 return EVT(MVT::nxv4f32);
14966 case MVT::f64:
14967 return EVT(MVT::nxv2f64);
14968 }
14969}
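// Illustrative mapping: the container depends only on the element type, so
// v4i32, v8i32 and v16i32 all map to nxv4i32, and v2f64 maps to nxv2f64;
// lanes beyond the fixed length are switched off by the predicate built by
// getPredicateForFixedLengthVector below.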
14970
14971// Return a PTRUE with active lanes corresponding to the extent of VT.
14972static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
14973 EVT VT) {
14974 assert(VT.isFixedLengthVector() &&
14975 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
14976 "Expected legal fixed length vector!");
14977
14978 int PgPattern;
14979 switch (VT.getVectorNumElements()) {
14980 default:
14981 llvm_unreachable("unexpected element count for SVE predicate");
14982 case 1:
14983 PgPattern = AArch64SVEPredPattern::vl1;
14984 break;
14985 case 2:
14986 PgPattern = AArch64SVEPredPattern::vl2;
14987 break;
14988 case 4:
14989 PgPattern = AArch64SVEPredPattern::vl4;
14990 break;
14991 case 8:
14992 PgPattern = AArch64SVEPredPattern::vl8;
14993 break;
14994 case 16:
14995 PgPattern = AArch64SVEPredPattern::vl16;
14996 break;
14997 case 32:
14998 PgPattern = AArch64SVEPredPattern::vl32;
14999 break;
15000 case 64:
15001 PgPattern = AArch64SVEPredPattern::vl64;
15002 break;
15003 case 128:
15004 PgPattern = AArch64SVEPredPattern::vl128;
15005 break;
15006 case 256:
15007 PgPattern = AArch64SVEPredPattern::vl256;
15008 break;
15009 }
15010
15011 // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
15012 // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
15013 // variants of instructions when available.
15014
15015 MVT MaskVT;
15016 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
15017 default:
15018 llvm_unreachable("unexpected element type for SVE predicate");
15019 case MVT::i8:
15020 MaskVT = MVT::nxv16i1;
15021 break;
15022 case MVT::i16:
15023 case MVT::f16:
15024 MaskVT = MVT::nxv8i1;
15025 break;
15026 case MVT::i32:
15027 case MVT::f32:
15028 MaskVT = MVT::nxv4i1;
15029 break;
15030 case MVT::i64:
15031 case MVT::f64:
15032 MaskVT = MVT::nxv2i1;
15033 break;
15034 }
15035
15036 return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
15037 DAG.getTargetConstant(PgPattern, DL, MVT::i64));
15038}
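// Illustrative example: for a fixed v8i32 this picks PgPattern vl8 and
// MaskVT nxv4i1, which materializes as "ptrue p0.s, vl8" -- a predicate
// with exactly the first eight 32-bit lanes active (assuming the subtarget's
// SVE registers are wide enough to hold v8i32).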
15039
15040static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
15041 EVT VT) {
15042 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
15043 "Expected legal scalable vector!");
15044 auto PredTy = VT.changeVectorElementType(MVT::i1);
15045 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
15046}
15047
15048static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
15049 if (VT.isFixedLengthVector())
15050 return getPredicateForFixedLengthVector(DAG, DL, VT);
15051
15052 return getPredicateForScalableVector(DAG, DL, VT);
15053}
15054
15055// Grow V to consume an entire SVE register.
15056static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
15057 assert(VT.isScalableVector() &&
15058 "Expected to convert into a scalable vector!");
15059 assert(V.getValueType().isFixedLengthVector() &&
15060 "Expected a fixed length vector operand!");
15061 SDLoc DL(V);
15062 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
15063 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
15064}
15065
15066// Shrink V so it's just big enough to maintain a VT's worth of data.
15067static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
15068 assert(VT.isFixedLengthVector() &&
15069 "Expected to convert into a fixed length vector!");
15070 assert(V.getValueType().isScalableVector() &&
15071 "Expected a scalable vector operand!");
15072 SDLoc DL(V);
15073 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
15074 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
15075}
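// Illustrative round trip: convertToScalableVector places a fixed v4i32
// value at element 0 of an undef nxv4i32 via INSERT_SUBVECTOR, and
// convertFromScalableVector recovers it with an EXTRACT_SUBVECTOR at index
// 0; both are effectively free since the fixed value lives in the low part
// of the same SVE register.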
15076
15077// Convert all fixed length vector loads larger than NEON to masked_loads.
15078SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
15079 SDValue Op, SelectionDAG &DAG) const {
15080 auto Load = cast<LoadSDNode>(Op);
15081
15082 SDLoc DL(Op);
15083 EVT VT = Op.getValueType();
15084 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15085
15086 auto NewLoad = DAG.getMaskedLoad(
15087 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
15088 getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
15089 Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
15090 Load->getExtensionType());
15091
15092 auto Result = convertFromScalableVector(DAG, VT, NewLoad);
15093 SDValue MergedValues[2] = {Result, Load->getChain()};
15094 return DAG.getMergeValues(MergedValues, DL);
15095}
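// Illustrative result: a fixed v8i32 load becomes, roughly, a masked load of
// an nxv4i32 container predicated on "ptrue p.s, vl8", followed by an
// EXTRACT_SUBVECTOR back to v8i32, with the load's chain preserved through
// the merged values.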
15096
15097// Convert all fixed length vector stores larger than NEON to masked_stores.
15098SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
15099 SDValue Op, SelectionDAG &DAG) const {
15100 auto Store = cast<StoreSDNode>(Op);
15101
15102 SDLoc DL(Op);
15103 EVT VT = Store->getValue().getValueType();
15104 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15105
15106 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
15107 return DAG.getMaskedStore(
15108 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
15109 getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
15110 Store->getMemOperand(), Store->getAddressingMode(),
15111 Store->isTruncatingStore());
15112}
15113
15114SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
15115 SDValue Op, SelectionDAG &DAG) const {
15116 EVT VT = Op.getValueType();
15117 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
15118
15119 SDLoc DL(Op);
15120 SDValue Val = Op.getOperand(0);
15121 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
15122 Val = convertToScalableVector(DAG, ContainerVT, Val);
15123
15124 // Repeatedly truncate Val until the result is of the desired element type.
15125 switch (ContainerVT.getSimpleVT().SimpleTy) {
15126 default:
15127 llvm_unreachable("unimplemented container type");
15128 case MVT::nxv2i64:
15129 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
15130 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
15131 if (VT.getVectorElementType() == MVT::i32)
15132 break;
15133 LLVM_FALLTHROUGH;
15134 case MVT::nxv4i32:
15135 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
15136 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
15137 if (VT.getVectorElementType() == MVT::i16)
15138 break;
15139 LLVM_FALLTHROUGH;
15140 case MVT::nxv8i16:
15141 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
15142 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
15143 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
15144 break;
15145 }
15146
15147 return convertFromScalableVector(DAG, VT, Val);
15148}
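// Illustrative walk-through: truncating i64 elements all the way to i8 falls
// through three rounds of bitcast + UZP1:
//   nxv2i64 -> bitcast nxv4i32 -> uzp1 z.s, z.s  (keep low 32-bit halves)
//           -> bitcast nxv8i16 -> uzp1 z.h, z.h  (keep low 16-bit halves)
//           -> bitcast nxv16i8 -> uzp1 z.b, z.b  (keep low  8-bit halves)
// where each UZP1 of Val with itself packs the even-indexed subelements.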
15149
15150SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
15151 SelectionDAG &DAG,
15152 unsigned NewOp) const {
15153 EVT VT = Op.getValueType();
15154 SDLoc DL(Op);
15155 auto Pg = getPredicateForVector(DAG, DL, VT);
15156
15157 if (useSVEForFixedLengthVectorVT(VT)) {
15158 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15159
15160 // Create list of operands by converting existing ones to scalable types.
15161 SmallVector<SDValue, 4> Operands = {Pg};
15162 for (const SDValue &V : Op->op_values()) {
15163 if (isa<CondCodeSDNode>(V)) {
15164 Operands.push_back(V);
15165 continue;
15166 }
15167
15168 assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
15169 "Only fixed length vectors are supported!");
15170 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
15171 }
15172
15173 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
15174 return convertFromScalableVector(DAG, VT, ScalableRes);
15175 }
15176
15177 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
15178
15179 SmallVector<SDValue, 4> Operands = {Pg};
15180 for (const SDValue &V : Op->op_values()) {
15181 assert((isa<CondCodeSDNode>(V) || V.getValueType().isScalableVector()) &&
15182 "Only scalable vectors are supported!");
15183 Operands.push_back(V);
15184 }
15185
15186 return DAG.getNode(NewOp, DL, VT, Operands);
15187}
unsigned const MachineRegisterInfo * MRI
if(Register::isVirtualRegister(Reg)) return MRI -> getRegClass(Reg) ->hasSuperClassEq(&AArch64::GPR64RegClass)
static unsigned MatchRegisterName(StringRef Name)
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG)
NarrowVector - Given a value in the V128 register class, produce the equivalent value in the V64 regi...
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG)
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static std::pair< SDValue, SDValue > splitInt128(SDValue N, SelectionDAG &DAG)
static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
static bool areExtractShuffleVectors(Value *Op1, Value *Op2)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static bool isREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
static SDValue performSRLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point divide by power of two into fixed-point to floating-point conversion.
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool canGuaranteeTCO(CallingConv::ID CC)
Return true if the calling convention is one that we can guarantee TCO for.
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static bool isUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static bool isTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N)
Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, bool &FromHi)
An EXTR instruction is made up of two shifts, ORed together.
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static unsigned getIntrinsicID(const SDNode *N)
static bool IsSVECntIntrinsic(SDValue S)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, bool isSigned)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG)
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC)
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
#define MAKE_CASE(V)
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG)
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static PredicateConstraint parsePredicateConstraint(StringRef Constraint)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static bool isZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static bool isSignExtended(SDNode *N, SelectionDAG &DAG)
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isCMN(SDValue Op, ISD::CondCode CC)
static SDValue tryCombineToEXTR(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
EXTR instruction extracts a contiguous chunk of bits from two existing registers viewed as a high/low...
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static Value * UseTlsOffset(IRBuilder<> &IRB, unsigned Offset)
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
static const unsigned PerfectShuffleTable[6561+1]
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static bool isConstant(const MachineInstr &MI)
amdgpu Simplify well known AMD library false FunctionCallee Callee
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static const MCPhysReg GPRArgRegs[]
Function Alias Analysis Results
assume Assume Builder
This file contains the simple types necessary to represent the attributes associated with functions a...
SmallVector< MachineOperand, 4 > Cond
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< ShadowStackGC > C("shadow-stack", "Very portable GC for uncooperative code generators")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition Compiler.h:280
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
PropagateLiveness Given that RA is a live propagate it s liveness to any other values it uses(according to Uses). void DeadArgumentEliminationPass
else return RetTy
#define LLVM_DEBUG(X)
Definition Debug.h:122
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
lazy value info
#define F(x, y, z)
Definition MD5.cpp:56
#define I(x, y, z)
Definition MD5.cpp:59
#define G(x, y, z)
Definition MD5.cpp:57
mir Rename Register Operands
static Value * getNumElements(BasicBlock *Preheader, Value *BTC)
unsigned const TargetRegisterInfo * TRI
unsigned Reg
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
uint32_t Size
Definition Profile.cpp:46
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition Statistic.cpp:50
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:167
static const int BlockSize
Definition TarWriter.cpp:33
This defines the Use class.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:421
static constexpr int Concat[]
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setArgumentStackToRestore(unsigned bytes)
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
static bool hasSVEArgsOrReturn(const MachineFunction *MF)
unsigned getPrefLoopLogAlignment() const
const AArch64RegisterInfo * getRegisterInfo() const override
unsigned getPrefFunctionLogAlignment() const
bool isMisaligned128StoreSlow() const
const AArch64InstrInfo * getInstrInfo() const override
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isCallingConvWin64(CallingConv::ID CC) const
unsigned getMinSVEVectorSizeInBits() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isXRegisterReserved(size_t i) const
bool predictableSelectIsExpensive() const
bool hasCustomCallingConv() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns true if the given (atomic) store should be expanded by the IR-level AtomicExpand pass into an...
int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
Value * emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
Value * getIRStackGuard(IRBuilder<> &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
Value * getSafeStackPointerLocation(IRBuilder<> &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
EVT is not used in-tree, but is used by out-of-tree target.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, unsigned Align=1, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, bool *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override
Return true if SHIFT instructions should be expanded to SHIFT_PARTS instructions, and false if a libr...
Value * emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
APInt bitcastToAPInt() const
Definition APFloat.h:1130
bool isPosZero() const
Definition APFloat.h:1214
void dump() const
Definition APFloat.cpp:4864
Class for arbitrary precision integers.
Definition APInt.h:69
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:953
static APInt getAllOnesValue(unsigned numBits)
Get the all-ones value.
Definition APInt.h:566
unsigned countTrailingZeros() const
Count the number of trailing zero bits.
Definition APInt.h:1692
unsigned logBase2() const
Definition APInt.h:1808
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:368
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:468
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Get a value with low bits set.
Definition APInt.h:666
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Get a value with high bits set.
Definition APInt.h:654
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1635
an instruction to allocate memory on the stack
This class represents an incoming formal argument to a Function.
Definition Argument.h:29
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
@ Nand
*p = ~(old & v)
bool isFloatingPointOperation() const
BinOp getOperation() const
This is an SDNode representing atomic operations.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but may be faster.
LLVM Basic Block Representation.
Definition BasicBlock.h:59
const BlockAddress * getBlockAddress() const
A "pseudo-class" with methods for operating on BUILD_VECTORs.
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
unsigned getNextStackOffset() const
getNextStackOffset - Return the next stack offset such that all stack slots satisfy their alignment r...
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
CCValAssign - Represent assignment of one arg/retval to a location.
Value * getArgOperand(unsigned i) const
unsigned getNumArgOperands() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
This is the shared class of boolean and integer constants.
Definition Constants.h:77
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:131
uint64_t getZExtValue() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:111
bool isBigEndian() const
Definition DataLayout.h:234
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition DataLayout.h:490
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
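For illustration, a small hypothetical helper combining the DataLayout queries above:
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Type.h"
  using namespace llvm;
  // Size of one array element of Ty, including tail padding between elements.
  static uint64_t paddedSizeInBytes(const DataLayout &DL, Type *Ty) {
    return DL.getTypeAllocSize(Ty).getFixedSize();
  }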
A debug info location.
Definition DebugLoc.h:33
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:65
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:616
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:644
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:219
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:230
const Function & getFunction() const
Definition Function.h:135
arg_iterator arg_begin()
Definition Function.h:720
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.h:330
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
bool hasExternalWeakLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
This class is used to represent ISD::LOAD nodes.
MCRegisterInfo base class - We assume that the target defines a static array of MCRegisterDesc object...
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static mvt_range fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
static mvt_range fp_scalable_vector_valuetypes()
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static mvt_range integer_valuetypes()
static mvt_range integer_fixedlen_vector_valuetypes()
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static mvt_range integer_scalable_vector_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isFixedLengthVector() const
static MVT getVectorVT(MVT VT, unsigned NumElements)
static mvt_range fp_fixedlen_vector_valuetypes()
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static mvt_range fp_valuetypes()
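A minimal sketch (hypothetical helper) using the MVT accessors above to build the half-width twin of a fixed-length vector type:
  #include "llvm/Support/MachineValueType.h"
  using namespace llvm;
  static MVT halfWidthVector(MVT VT) {
    assert(VT.isFixedLengthVector() && "expected a fixed-length vector");
    return MVT::getVectorVT(VT.getVectorElementType(),
                            VT.getVectorNumElements() / 2);
  }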
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setStackID(int ObjectIdx, uint8_t ID)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
void computeMaxCallFrameSize(const MachineFunction &MF)
Computes the maximum size of a callframe and the AdjustsStack property.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
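Sketch of the common pattern these MachineFrameInfo methods serve, e.g. when spilling incoming varargs to fixed slots (helper name is hypothetical):
  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
  using namespace llvm;
  // One GPR-sized slot at a fixed offset from the incoming stack pointer.
  static int createVarArgSlot(MachineFunction &MF, int64_t SPOffset) {
    return MF.getFrameInfo().CreateFixedObject(/*Size=*/8, SPOffset,
                                               /*IsImmutable=*/false);
  }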
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=nullptr)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
bool isVolatile() const
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
unsigned getAlignment() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.cpp:397
void dump() const
Definition Pass.cpp:131
Class to represent pointers.
Type * getElementType() const
Wrapper class representing virtual and physical registers.
Definition Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
TypeSize getScalarValueSizeInBits() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
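A minimal sketch using the SDValue/SDNode accessors above to peek through a single-use zero-extend (hypothetical helper):
  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;
  static SDValue stripSingleUseZExt(SDValue V) {
    if (V.getOpcode() == ISD::ZERO_EXTEND && V.getNode()->hasOneUse())
      return V.getOperand(0); // the narrower source value
    return V;
  }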
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=None, int Offset=0, unsigned TargetFlags=0)
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, uint64_t Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
void addCallSiteInfo(const SDNode *CallNode, CallSiteInfoImpl &&CallInfo)
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
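For a feel of the SelectionDAG builders above, a hedged sketch (helper name hypothetical) that emits an i32 boolean for "X == 0":
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;
  static SDValue emitIsZero(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
    SDValue Zero = DAG.getConstant(0, DL, X.getValueType());
    return DAG.getSetCC(DL, MVT::i32, X, Zero, ISD::SETEQ);
  }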
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
StackOffset is a wrapper around scalable and non-scalable offsets and is used in several functions su...
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:57
std::enable_if_t< std::numeric_limits< T >::is_signed, bool > getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:511
LLVM_NODISCARD StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition StringRef.h:713
LLVM_NODISCARD size_t size() const
size - Get the string size.
Definition StringRef.h:160
Class to represent struct types.
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:356
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setTargetDAGCombine(ISD::NodeType NT)
Targets should invoke this method for each target independent node that they want to provide a custom...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual Value * getSafeStackPointerLocation(IRBuilder<> &IRB) const
Returns the target-specific address of the unsafe stack pointer.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilder<> &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
void setIndexedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
bool isOperationLegalOrCustom(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
void setCondCodeAction(ISD::CondCode CC, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
void setIndexedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
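These protected hooks are invoked from a TargetLowering subclass constructor. An illustrative fragment (the opcode/type choices here are examples, not the actual AArch64 configuration, which appears earlier in this file):
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); // custom-lower this op
  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Legal);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTargetDAGCombine(ISD::ADD); // request PerformDAGCombine callbacks on ADD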
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned EmitCallSiteInfo
The flag enables call site info production.
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fuse-fp-ops=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:45
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition Triple.h:569
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition Triple.h:542
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:80
uint64_t getKnownMinSize() const
Definition TypeSize.h:145
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:187
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:231
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:225
@ FloatTyID
32-bit floating point type
Definition Type.h:59
@ DoubleTyID
64-bit floating point type
Definition Type.h:60
static Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:170
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition Type.cpp:234
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:113
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:127
bool isFloatingPointTy() const
Return true if this is one of the six floating-point types.
Definition Type.h:162
A Use represents the edge between a Value definition and its users.
Definition Use.h:44
Value * getOperand(unsigned i) const
Definition User.h:169
unsigned getNumOperands() const
Definition User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:244
User * user_back()
Definition Value.h:404
Base class of all SIMD vector types.
Type * getElementType() const
Implementation for an ilist node.
Definition ilist_node.h:39
self_iterator getIterator()
Definition ilist_node.h:81
#define UINT64_MAX
Definition DataTypes.h:77
#define INT64_MAX
Definition DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
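A tiny sketch of the AArch64CC helpers above (include path per the in-tree layout; helper name hypothetical):
  #include "Utils/AArch64BaseInfo.h"
  using namespace llvm;
  static AArch64CC::CondCode invertedCC(AArch64CC::CondCode CC) {
    // Flips the low bit of the encoding: EQ <-> NE, GE <-> LT, ...
    return AArch64CC::getInvertedCondCode(CC);
  }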
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
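A hedged sketch of the logical-immediate helpers above (include path per the in-tree layout; helper name hypothetical):
  #include "MCTargetDesc/AArch64AddressingModes.h"
  using namespace llvm;
  // Returns true and sets Enc to the N:immr:imms encoding if Imm is one of the
  // repeating bit patterns accepted by AND/ORR/EOR (immediate) on 64-bit regs.
  static bool tryEncodeLogicalImm(uint64_t Imm, uint64_t &Enc) {
    if (!AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64))
      return false;
    Enc = AArch64_AM::encodeLogicalImmediate(Imm, /*regSize=*/64);
    return true;
  }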
const unsigned NeonBitsPerVector
static constexpr unsigned SVEBitsPerBlock
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition BitmaskEnum.h:80
@ AArch64_SVE_VectorCall
Calling convention between AArch64 SVE functions.
@ CFGuard_Check
Special calling convention on Windows for calling the Control Flow Guard Check ICall function.
Definition CallingConv.h:87
@ Fast
Fast - This calling convention attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:42
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
@ C
C - The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:620
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:220
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition ISDOpcodes.h:903
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition ISDOpcodes.h:899
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:427
@ FLT_ROUNDS_
FLT_ROUNDS_ - Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest 2 Round to ...
Definition ISDOpcodes.h:726
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:234
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:498
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:585
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition ISDOpcodes.h:932
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:253
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:223
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition ISDOpcodes.h:817
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:650
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:431
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:183
@ GlobalAddress
Definition ISDOpcodes.h:71
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:657
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:484
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:342
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:559
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:239
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:744
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:213
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:72
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:644
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:401
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:528
@ ADDROFRETURNADDR
Definition ISDOpcodes.h:89
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition ISDOpcodes.h:988
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition ISDOpcodes.h:811
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition ISDOpcodes.h:762
@ BR_CC
BR_CC - Conditional branch.
Definition ISDOpcodes.h:854
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:288
@ BRIND
BRIND - Indirect branch.
Definition ISDOpcodes.h:838
@ BR_JT
BR_JT - Jumptable branch.
Definition ISDOpcodes.h:842
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:310
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:597
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:206
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:535
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition ISDOpcodes.h:928
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:284
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:540
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:576
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:520
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:511
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:476
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:187
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:647
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition ISDOpcodes.h:982
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:612
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition ISDOpcodes.h:794
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:292
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition ISDOpcodes.h:827
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:665
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:545
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:729
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:606
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:400
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:87
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:394
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:416
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:393
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition ISDOpcodes.h:807
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:703
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition ISDOpcodes.h:959
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:142
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:551
@ TRAP
TRAP - Trapping instruction.
Definition ISDOpcodes.h:979
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:168
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:263
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:465
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:52
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:717
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:653
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition ISDOpcodes.h:923
@ BRCOND
BRCOND - Conditional branch.
Definition ISDOpcodes.h:848
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:633
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:59
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:441
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:301
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:176
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:456
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
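Combining the two helpers above gives the code for "!(X op Y)" with operands swapped; a one-line sketch (helper name hypothetical):
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/CodeGen/ValueTypes.h"
  using namespace llvm;
  static ISD::CondCode invertAndSwap(ISD::CondCode CC, EVT VT) {
    // !(X op Y) == (Y op' X) with op' = swap(invert(op))
    return ISD::getSetCCSwappedOperands(ISD::getSetCCInverse(CC, VT));
  }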
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
bool match(Val *V, const Pattern &P)
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
class_match< UndefValue > m_Undef()
Match an arbitrary undef constant.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
match_combine_or< CastClass_match< OpTy, Instruction::ZExt >, CastClass_match< OpTy, Instruction::SExt > > m_ZExtOrSExt(const OpTy &Op)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
initializer< Ty > init(const Ty &Val)
CodeModel::Model getCodeModel()
constexpr double e
Definition MathExtras.h:58
This class represents lattice values for constants.
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1511
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool operator==(uint64_t V1, const APInt &V2)
Definition APInt.h:2027
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:456
bool RetCC_AArch64_WebKit_JS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:177
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isAcquireOrStronger(AtomicOrdering ao)
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:497
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1326
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:603
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:486
unsigned M1(unsigned Val)
Definition VE.h:353
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1498
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:597
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned countLeadingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count the number of 0's from the most significant bit to the least, stopping at the first 1.
Definition MathExtras.h:226
constexpr size_t array_lengthof(T(&)[N])
Find the length of an array.
Definition STLExtras.h:1335
unsigned countTrailingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count the number of 0's from the least significant bit to the most, stopping at the first 1.
Definition MathExtras.h:157
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:132
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
Definition ArrayRef.h:458
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition Error.cpp:140
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:474
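The bit-pattern predicates above are constexpr, so their behavior can be shown with static_asserts (values here are illustrative):
  #include "llvm/Support/MathExtras.h"
  using namespace llvm;
  static_assert(isPowerOf2_64(0x40), "exactly one set bit");
  static_assert(isMask_64(0x00FF), "contiguous ones starting at bit 0");
  static_assert(isShiftedMask_64(0x0FF0), "contiguous ones anywhere");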
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
AtomicOrdering
Atomic ordering for LLVM's memory model.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition DAGCombine.h:15
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:461
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:158
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition MathExtras.h:673
@ Invalid
Denotes invalid value.
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr unsigned BitWidth
bool isReleaseOrStronger(AtomicOrdering ao)
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1518
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_WebKit_JS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask: Start, Start + 1, ..., Start + NumInts - 1, padded with NumUndefs undef (-1) entries.
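For example, four sequential lanes starting at 2, padded with two undefs:

  #include "llvm/Analysis/VectorUtils.h"
  // Mask == {2, 3, 4, 5, -1, -1}; -1 denotes an undef lane.
  llvm::SmallVector<int, 16> Mask = llvm::createSequentialMask(2, 4, 2);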
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:962
#define N
Helper structure (AArch64SetCCInfo) to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure (GenericSetCCInfo) to keep track of ISD::SET_CC operands.
Helper structure (SetCCInfoAndKind) used to read SetCC information, recording whether the SetCC was lowered to AArch64 code.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
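A short sketch of constructing an Align and reading it back as a plain integer:

  #include "llvm/Support/Alignment.h"
  llvm::Align A(16);        // must be a non-zero power of two
  uint64_t Raw = A.value(); // 16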
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition ValueTypes.h:94
TypeSize getScalarSizeInBits() const
Definition ValueTypes.h:321
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:331
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:131
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:141
ElementCount getVectorElementCount() const
Definition ValueTypes.h:297
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:315
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition ValueTypes.h:391
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:260
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:187
EVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:120
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition ValueTypes.h:374
bool isFixedLengthVector() const
Definition ValueTypes.h:166
std::string getEVTString() const
This function returns the value type as a string, e.g. "i32".
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:156
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:267
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:162
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:272
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:151
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
Definition ValueTypes.h:108
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:280
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:383
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:146
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:182
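Pulling a few of the EVT queries above together (illustrative fixed-width types only):

  #include "llvm/CodeGen/ValueTypes.h"
  llvm::EVT VT = llvm::MVT::v4f32;
  bool Is128 = VT.is128BitVector();                        // true
  unsigned NElts = VT.getVectorNumElements();              // 4
  llvm::EVT IntVT = VT.changeVectorElementTypeToInteger(); // v4i32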
Describes a register that needs to be forwarded from the prologue to a musttail call.
InputArg - This struct carries flags and type information about a single incoming (formal) argument or incoming (from the perspective of the caller) return value virtual register.
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing (from the perspective of the caller) return value virtual register.
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:39
Structure used to represent a pair of the argument number after call lowering and the register used to transfer that argument.
This class contains a discriminated union of information about pointers in memory operands, determining which reference various implicit values are.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
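A sketch of combining these factories; the helper name and the +8 offset are hypothetical:

  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineMemOperand.h"
  // Pointer info for frame index FI, then the same slot 8 bytes in.
  llvm::MachinePointerInfo slotPlus8(llvm::MachineFunction &MF, int FI) {
    return llvm::MachinePointerInfo::getFixedStack(MF, FI).getWithOffset(8);
  }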
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:119
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowReassociation(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg. If BaseGV is null, there is no BaseGV; if BaseOffs is zero, there is no offset; if BaseReg is null, there is no base register.
This structure contains all information that is necessary for lowering calls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
Helper structure (the SetCCInfo union) to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64